In this document, we try to find the optimal server lookup period with respect to my email distribution.

Loading the data

df <- read.csv(params$filename, header = F)
colnames(df) <- c("timestamp")
df$timestamp <- sort(df$timestamp)
total_emails <- length(df$timestamp)
time_interval <- 24 * 60 * 60
total_days <- as.integer((max(df$timestamp) - min(df$timestamp)) / time_interval)

By looking a the data, I have received 2209 emails over 948 days.

Distance between emails

It is also intersting to look at the interval between two received emails.

df_diff <- diff(df$timestamp)
summary(df_diff)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    3150   12851   37128   47294  659270

What we can see here is that the mean number of seconds between 2 emails is 37128.

The standard deviation of this distance is 66095.

Distribution of number of emails received everyday

We would like to find the distribution of the number of emails I receive everyday.

df$time <- as.POSIXct(df$timestamp, origin="1970-01-01")
df$day <- as.Date(df$time, format="%d/%m/%Y")
df$seconds <- as.integer(difftime(df$time, df$day))
d <- df %>% group_by(day) %>% summarise(nb_mail = n())
nb_days_without_emails <- total_days - nrow(d)

mails_per_day <- d %>% group_by(nb_mail) %>% summarise(count = n())
mails_per_day[nrow(mails_per_day) + 1,] <- c(0, nb_days_without_emails)
mails_per_day <- mails_per_day[order(mails_per_day$nb_mail),]
mails_per_day$density <- mails_per_day$count / sum(mails_per_day$count)
lambda_w <- weighted.mean(mails_per_day$nb_mail, mails_per_day$count)
kable(mails_per_day)

nb_mail	count	density
0	288	0.3037975
1	166	0.1751055
2	134	0.1413502
3	110	0.1160338
4	90	0.0949367
5	49	0.0516878
6	38	0.0400844
7	27	0.0284810
8	24	0.0253165
9	6	0.0063291
10	9	0.0094937
11	3	0.0031646
12	1	0.0010549
13	2	0.0021097
16	1	0.0010549

So, everyday, I receive in average 2.3301688 mails.

We can plot this data:

ggplot(mails_per_day) +
    geom_point(aes(x = nb_mail, y = density)) +
    geom_smooth(aes(x = nb_mail, y = density))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

It makes sense to try to fit an exponential distribution on this data:

lambda <- mails_per_day$density[1]

As a first estimation, we can take the value at 0 to get the parameter of the distribution (\(\lambda = 0.3037975\))

mails_per_day$expo_law <- lambda * exp(- lambda * (mails_per_day$nb_mail))
ggplot(mails_per_day) +
    geom_point(aes(x = nb_mail, y = density)) +
    geom_line(aes(x = nb_mail, y = expo_law))

Skellam Distribution

\(\displaystyle P(X_1 - X_2 = k) = e^{-(\lambda_1 + \lambda_2)}\left(\frac{\lambda_1}{\lambda_2}\right) ^{k/2} I_{|k|}\left(2\sqrt{\lambda_1 \lambda_2}\right)\)

In our case \(X_1, X_2 \rightsquigarrow Poiss(\lambda)\)

Thus:

\(\displaystyle P(X_1 - X_2 = k) = e^{-2\lambda}I_{|k|}\left(2\lambda\right)\)

\(\displaystyle P(|X_1 - X_2| = k) = P(X_1 - X_2 = k) + P(X_2 - X_1 = k) = 2 \times P(X_1 - X_2 = k)\)

where

\(\displaystyle I_{|k|}(x) = \sum _{m=0} ^{\infty} \frac{1}{m!(m + k)!}\left(\frac{x}{2}\right)^{2m+k}\)

Thus in our case:

\(\displaystyle P(|X_1 - X_2| = k) = 2e^{-2\lambda}\sum _{m=0} ^{+\infty} \frac{1}{m!(m + k)!}\left(\frac{2\lambda}{2}\right)^{2m+k}\)

We can simplify a bit:

\(\displaystyle P(|X_1 - X_2| = k) = 2e^{-2\lambda}\sum _{m=0} ^{+\infty} \frac{1}{m!(m + k)!}\lambda^{2m+k}\)

What we want is the esperance of the difference: \(\mathbb{E}[|X_1 - X_2|]\):

\[\begin{equation} \displaystyle \mathbb{E}[|X_1 - X_2|] = \sum_{k=0}^{+\infty} 2 k P(|X_1 - X_2| = k) = 2 \sum_{k=0}^{+\infty} \sum _{m=0} ^{+\infty} \frac{k e^{-2\lambda}}{m!(m + k)!}\lambda^{2m+k} \end{equation}\]

lambda_ske <- lambda_w
sum <- 0
previous <- lambda_ske
current <- 0
k <- 1
diff <- abs(previous - current)
while (!is.na(diff) && diff > 0.0001) {
    previous <- current
    current <- 2 * k * exp(-2 * lambda_ske) * besselI(2 * lambda_ske, k, expon.scaled=FALSE)
    sum <- sum + current
    k <- k + 1
    diff <- abs(previous - current) 
}
sum

## [1] 1.674047

distance_in_secondes <- sum * time_interval
distance_in_secondes

## [1] 144637.7

seconds_to_period(distance_in_secondes)

## [1] "1d 16H 10M 37.6620270991989S"

Email Distribution during the Day

We can plot the time of reception of the emails during the day:

fit <- fitdistr(df$seconds, "normal")
para <- fit$estimate
para

##     mean       sd 
## 42802.32 18138.12

mean <- para[1]
sd <- para[2]
df$dnorm <- dnorm(df$seconds, mean, sd)
ggplot(data = df) +
    geom_histogram(aes(x = seconds, y = ..density..), bins=50) +
    stat_function(fun = dnorm, args = list(mean = mean, sd = sd))

We can see two spikes, they are around 10am and 3pm.

#
# Specify parameters
#
mu <- c(mean, mean)
sigma <- c(sd, sd)

#
# Simulate data
#
n.sim <- 100000
set.seed(17)
x.sim <- matrix(rnorm(n.sim*2, mu, sigma), nrow=2)
x <- abs(x.sim[2, ] - x.sim[1, ])
mean(x)

## [1] 20435.99

#
# Display the results
#
hist(x, freq=FALSE)
f <- function(x, mu, sigma) {
     sqrt(2 / pi) / sigma * cosh(x * mu / sigma^2) * exp(-(x^2 + mu^2)/(2*sigma^2)) 
}
curve(f(x, abs(diff(mu)), sqrt(sum(sigma^2))), lwd=2, col="Red", add=TRUE)

\[ f_{|X_1 - X_2|}(x) = \frac{1}{\sqrt{\pi}\sigma} e ^{-\dfrac{1}{4}\left(\dfrac{x}{\sigma}\right)^{2}} \]

\[ \mathbb{E}[|X_1 - X_2|] = \displaystyle \int _{0} ^{+\infty} x f_{|X_1 - X_2|}(x) dx = \frac{2\sigma}{\sqrt{\pi}} \]

In our case \(\mathbb{E}[|X_1 - X_2|] = 20466.6811136\)

This means that with this daily distribution, the average time between two emails is:

seconds_to_period(as.integer(2 * sd / sqrt(pi)))

## [1] "5H 41M 6S"

Finding Optimal Server Lookup Time

Definition of the Problem

We are trying to minimize the following cost function:

\[ \displaystyle J = \sum _{j = 1} ^{N + 1} \sum _{i = 0} ^{\lfloor\frac{T_j}{k}\rfloor - \lceil\frac{T_{j-1}}{k}\rceil} \left( \left( \lceil\frac{T_{j-1}}{k}\rceil k - T_{j-1}\right) + i k\right)^{2} \]

\[ \displaystyle J = \sum _{j = 1} ^{N + 1} \sum _{i = 0} ^{floor\left(\frac{T_j}{k}\right) - ceil\left(\frac{T_{j-1}}{k}\right)} \left( \left( ceil\left(\frac{T_{j-1}}{k}\right) k - T_{j-1}\right) + i k\right)^{2} \]

Let \(M_j = floor\left(\frac{T_j}{k}\right) - ceil\left(\frac{T_{j-1}}{k}\right)\).

And let \(R_j = ceil\left(\frac{T_{j-1}}{k}\right)k - T_{j-1}\)

We can expand \(J\):

\[ \displaystyle J = \sum _{j=1}^{N+1} \left(\left(M_j + 1\right) R_j^{2} +2 k R_j\frac{M_j(M_j + 1)}{2} +\frac{k^{2}}{6}M_j(M_j + 1)(2M_j + 1)\right) \]

We want to minimize \(J\). So we will find an upper bound of \(J\) and find the value of \(k\) that minimize it.

We remind the reader that:

\[ x - 1 < floor(x) \leq x \] and \[ x \leq ceil(x) < x + 1 \]

Let \(J_1 = \left(M_j + 1\right) R_j^{2}\)

Let \(J_2 = 2 k R_j\frac{M_j(M_j + 1)}{2}\)

Let \(J_3 = \frac{k^{2}}{6}M_j(M_j + 1)(2M_j + 1)\)

Upper Bound of \(J\)

Let us find an upper bound for each of these quantities:

\[ M_j < \frac{1}{k}\left(T_j - T_{j-1}\right) \]

\[ R_j < k \]

\[ J_1 < \left( \frac{T_j}{k} - \frac{T_{j-1}}{k} + 1\right)\left(\left(\frac{T_{j-1}}{k} + 1\right)k - T_{j-1}\right)^2 = k\left(T_j - T_{j-1} + k\right) \]

\[ J_2 < 2k^{2}\frac{\frac{1}{k^2}\left(T_j - T_{j-1}\right)\left(T_j - T_{j-1} + k\right)}{2} = \left(T_j - T_{j-1}\right)^{2} + k\left(T_j - T_{j-1}\right) \]

\[\begin{equation} \begin{split} J_3 &< \frac{k^2}{6}\left(\frac{T_j - T_{j-1}}{k}\right)\left(\frac{T_j - T_{j-1} + k}{k}\right)\left(\frac{2\left(T_j - T_{j-1}\right) + k}{k}\right) \\ &= \frac{1}{6k}\left(T_j - T_{j-1}\right)\left(2\left(T_j - T_{j-1}\right)^{2} + 3k\left(T_j - T_{j-1}\right) + k^2\right) \\ &= \left(T_j - T_{j-1}\right)\left(\frac{1}{3k}\left(T_j - T_{j-1}\right)^2 + \frac{1}{2}\left(T_j - T_{j-1}\right) + \frac{k}{6}\right) \end{split} \end{equation}\]

Let \(\Delta T_j = T_j - T_{j-1}\).

We then have:

\[ J_1 < k\Delta T_j + k^2 \]

\[ J_2 < \Delta T_j ^2 + k \Delta T_j \]

\[ J_3 < \Delta T_j \left( \frac{\Delta T_j ^2}{3k} + \frac{\Delta T_j}{2} + \frac{k}{6}\right) \]

Let us sum everything:

\[\begin{equation} \begin{split} J & < \sum_{j=1}^{N+1} J_1 + J_2 + J_3 \\ & = \sum_{j=1}^{N+1} \left(\frac{1}{3k}\Delta T_j^3 + \frac{3}{2}\Delta T_j^2 + \frac{13k}{6}\Delta T_j + k^2\right) \\ & = U(k) \end{split} \end{equation}\]

We want to minimze, so let us differentiate the upper bound with respect to \(k\):

\[\begin{equation} \begin{split} \frac{dU}{dk}(k) & = \sum_{j=1}^{N+1}\left(\frac{-1}{3k^2}\Delta T_j ^3 + \frac{13}{6}\Delta T_j + 2k\right) \\ & = \frac{13}{6}T_{N+1} + 2(N + 1)k - \frac{1}{3k^2}\sum_{j=1}^{N+1}\Delta T_j ^3 \end{split} \end{equation}\]

Let us solve for \(\frac{dU}{dk}(k) = 0\):

\[\begin{equation} \displaystyle k^3 + \frac{13}{12(N+1)}T_{N+1}k^2 - \frac{1}{6(N+1)}\sum_{j=1}^{N+1}\Delta T_j^3 = 0 \end{equation}\]

Lower Bound of \(J\)

In the same fashion as for finding the upper bound, we get:

\[ \displaystyle \frac{1}{3k}\sum_{j=1}^{N+1} \Delta T_j^3 - \frac{3}{2}\sum_{j=1}^{N+1}\Delta T_j^2 + \frac{13k}{6}T_{N+1} - k^2 < J \]

We can differentiate the lower bound with respect to \(k\):

\[\begin{equation} \displaystyle k^3 - \frac{13}{12(N+1)}T_{N+1}k^2 + \frac{1}{6(N+1)}\sum_{j=1}^{N+1}\Delta T_j^3 = 0 \end{equation}\]

Median value

When looking at the upper and lower bounds, we see that there are some commun terms.

Indeed,

\[\begin{equation} \sum_{j=1}^{N+1} \left(\frac{1}{3k}\Delta T_j^3 - \frac{3}{2}\Delta T_j^2 + \frac{13k}{6}\Delta T_j - k^2\right) < J < \sum_{j=1}^{N+1} \left(\frac{1}{3k}\Delta T_j^3 + \frac{3}{2}\Delta T_j^2 + \frac{13k}{6}\Delta T_j + k^2\right) \end{equation}\]

Thus,

\[\begin{equation} \left \lvert J - \sum_{j=1}^{N+1} \left(\frac{1}{3k}\Delta T_j^3 + \frac{13k}{6}\Delta T_j\right) \right \rvert < \sum_{j=1}^{N+1} \left(\frac{3}{2}\Delta T_j^2 + k^2\right) \end{equation}\]

So \(\sum_{j=1}^{N+1} \left(\frac{1}{3k}\Delta T_j^3 + \frac{13k}{6}\Delta T_j\right)\) seems to be a decent approximation of \(J\).

We can easily find the value of \(k\) minimizing this approximation:

\[ \displaystyle k = \sqrt{\frac{2}{13\Delta T_{N+1}}\sum_{j=1}^{N+1}\Delta T_j^3} \]

Analysis

Let’s take a look at \(\displaystyle \sum_{j=1}^{N+1}\Delta T_j ^3\):

\[\begin{equation} \begin{split} \displaystyle \sum_{j=1}^{N+1}\Delta T_j ^3 &= \sum_{j=1}^{N+1} \left(T_j - T_{j-1}\right)^3 \\ &= \sum_{j=1}^{N+1}\left(T_j^3 - 3T_j^2T_{j-1} + 3T_jT_{j-1}^2 - T_{j-1}^3\right) \\ &= T_{N+1}^3 - T_0^3 - 3\left(T_NT_{N+1}^2 - T_0^2T_1 + \sum_{j=1}^{N+1}T_j^2\left(T_{j-1} - T_{j+1}\right)\right) \\ &= T_{N+1}^3 - 3 \mu T_{N+1}^2 + 3\sum_{j=1}^{N+1}T_j^2\left(T_{j+1} - T_{j-1}\right) \\ &\simeq T_{N+1}^3 - 3 \mu T_{N+1}^2 + 3\frac{\sigma}{\sqrt{\pi}}\sum_{j=1}^{N+1}T_j^2 \\ &= T_{N+1}^3 - 3 \mu T_{N+1}^2 + 3\frac{\sigma}{\sqrt{\pi}}\sum_{j=1}^{N+1}\left(T_j - \mu + \mu\right)^2 \\ &= T_{N+1}^3 - 3 \mu T_{N+1}^2 + 3\frac{\sigma}{\sqrt{\pi}}\sum_{j=1}^{N+1}\left(\left(T_j - \mu\right)^2 + 2\mu\left(T_j - \mu\right) + \mu^2\right) \\ &\simeq T_{N+1}^3 - 3 \mu T_{N+1}^2 + 3\frac{\sigma}{\sqrt{\pi}}\sum_{j=1}^{N+1}\left(\sigma^2 + \mu^2\right) \\ &= T_{N+1}^3 - 3 \mu T_{N+1}^2 + 3\frac{\sigma}{\sqrt{\pi}}\left(N+1\right)\left(\sigma^2 + \mu^2\right) \end{split} \end{equation}\]

N <- 40
sum_cube <- function(N) {
    Ts <- rnorm(N, mean = mean, sd = sd)
    Ts <- append(Ts, 0)
    Ts <- append(Ts, time_interval)
    Ts <- sort(Ts)
    diffs <- diff(Ts)
    diffs <- diffs**3
    sum(diffs)
}

approx_sum_cube <- function(N) {
    iterations = 10000
    data <- seq(1, iterations, 1)
    for (i in 1:iterations) {
        data[i] = sum_cube(N)
    }
    mean(data)
}

find_upper_bound_k <- function(n) {
    tn = time_interval
    coefs <- c(-(1/6)*v_approx_sum_cube[n], 0, 13*tn/12, n + 1)
    results <- polyroot(coefs)
    Re(results[1])
}

find_lower_bound_k <- function(n) {
    tn = time_interval
    coefs <- c((1/6)*v_approx_sum_cube[n], 0, -13*tn/12, n + 1)
    results <- polyroot(coefs)
    Re(results[1])
}

approx_k <- function(n) {
    tn = time_interval
    sqrt(2 * v_approx_sum_cube[n] / (13 * tn))
}

N_max <- 25
v_approx_sum_cube <- seq(1, N_max, 1)
number_of_mails <- seq(1, N_max, 1)
upper_bound_k <- seq(1, N_max, 1)
lower_bound_k <- seq(1, N_max, 1)
approx <- seq(1, N_max, 1)
opti_period <- data.frame(number_of_mails, lower_bound_k, approx, upper_bound_k)
for (i in 1:N_max) {
    v_approx_sum_cube[i] = approx_sum_cube(i)
    opti_period$upper_bound_k[i] <- find_upper_bound_k(i)
    opti_period$lower_bound_k[i] <- find_lower_bound_k(i)
    opti_period$approx[i] <- approx_k(i)
}

opti_period$lower_nb_checks <- time_interval / opti_period$lower_bound_k
opti_period$approx_nb_checks <- time_interval / opti_period$approx
opti_period$upper_nb_checks <- time_interval / opti_period$upper_bound_k


linear_model <- lm(data = opti_period, upper_bound_k ~ I(1/number_of_mails) + number_of_mails)
summary(linear_model)

## 
## Call:
## lm(formula = upper_bound_k ~ I(1/number_of_mails) + number_of_mails, 
##     data = opti_period)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -793.9 -343.7 -129.5  264.5 1051.6 
## 
## Coefficients:
##                      Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)           6339.57     316.77  20.013 0.00000000000000131 ***
## I(1/number_of_mails) 12369.17     628.85  19.670 0.00000000000000189 ***
## number_of_mails       -173.92      17.64  -9.858 0.00000000156603161 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 469.5 on 22 degrees of freedom
## Multiple R-squared:  0.9842, Adjusted R-squared:  0.9827 
## F-statistic: 684.4 on 2 and 22 DF,  p-value: < 0.00000000000000022

ggplot(data = opti_period) +
    geom_point(aes(x = number_of_mails, y = upper_bound_k), color = "red") +
    geom_point(aes(x = number_of_mails, y = approx), color = "green") +
    geom_point(aes(x = number_of_mails, y = lower_bound_k), color = "blue")# +

    # geom_smooth(aes(x = number_of_mails, y = k), method = lm, formula = y ~ I(1/x) + x)

opti_period

But we do not know how many mail we will receive each day.

But we know the distribution of the number of emails.

Let us compute the weighted mean for the value of \(k\):

\[ k = \sum_{n=0}^{+\infty}k(n) \times P(N = n) \]

where \(N \rightsquigarrow Exp(\lambda)\).

opti_period$w <- opti_period$upper_bound_k * lambda * exp(- lambda * (opti_period$number_of_mails))
mean_k <- sum(opti_period$w)

From the estimated value of \(\lambda\) seen above, we get that \(k = 10034.3608886\).

This means a total of 8.6104139 checks everyday.

PID Controller

The goal is now to design a PID Controller able to regulate the time period between two server lookup.

The idea is the following: if I have no mail, I increase the time interval, if I just received a mail, then I decrease the time interval.

The expression of a PID Controller is as follow:

\[\begin{equation} u(n+1) = u(n) + k_p e(n) + k_i \sum_{m=0}^{n}\frac{e(m)}{\Delta t} + k_d \frac{e(n) - e(n-1)}{\Delta T} \end{equation}\]

\(e(n)\) is the error at step \(n\) and \(u(n)\) is the time interval at step \(n\).

We will define the error as the time since the last email received.

The sign of the error is positive if we did not receive a mail since our last check, negative otherwise.

Simulator

pid_inc <- function(kp, ki, kd, error, cumulated_error, previous_error) {
    inc_p <- kp * error
    inc_i <- ki * cumulated_error
    inc_d <- kd * (error - previous_error)

    inc_p + inc_i + inc_d
}

bound <- function(x, mini, maxi) {
    if (x > maxi) {
        maxi
    } else if (x < mini) {
        mini
    } else {
        x
    }
}

generate_emails_time <- function(lambda, mu, sigma) {
    nb_emails <- as.integer(rexp(1, rate = lambda))
    emails <- sort(rnorm(nb_emails, mean = mu, sd = sigma))
    emails
}

simulate <- function(kp, ki, kd, min_value, max_value, emails) {
    sleep_time <- min_value

    cumulated_error <- 0
    previous_error <- 0
    error <- 0
    current_time <- 0
    timestamp_last_email_received <- 0
    nb_requests <- 0

    while (current_time < 24 * 60 * 60) {
        previous_error <- error
        current_time <- current_time + sleep_time
        new_emails <- c()
        for (email_time in emails) {
            if (email_time <= current_time &&
                email_time >  current_time - sleep_time) {
                new_emails <- append(new_emails, email_time)
            }
        }
        nb_new_emails <- length(new_emails)
        nb_requests <- nb_requests + 1
        if (nb_new_emails == 0) {
            error <- current_time - timestamp_last_email_received
        } else {
            error <- new_emails[1] - current_time
            timestamp_last_email_received <- new_emails[nb_new_emails]
        }
        cumulated_error <- cumulated_error + error
        sleep_time <- sleep_time + pid_inc(kp, ki, kd, error, cumulated_error, previous_error)
        sleep_time <- bound(sleep_time, min_value, max_value)
    }
    c(cumulated_error, nb_requests)
}

emails <- generate_emails_time(lambda, mean, sd)
emails

## [1] 23614.39 52250.43

simulate(0.1, 0.1, 0.1, 5 * 60, 4 * 60 * 60, emails)

## [1] 124757     13

Experiments

# To run the experiments
# python simulator.py [lambda] [mu] [sigma] [k_opti] [filename]

We will run the experiments with:

\(\lambda = 0.3037975\)
\(\mu = 42802.3173382\)
\(\sigma = 18138.1238776\)
\(k = 10034.3608886\)

data_expe <- read.csv("data_experiment.csv", header = F, sep = ",")
colnames(data_expe) <- c("kp", "ki", "kd", "min_time", "max_time", "error_pid", "req_pid", "error_k", "req_k")
head(data_expe)

df_pid <- data_expe %>% group_by(kp, ki, kd) %>% summarise(error_pid = mean(error_pid),
                                                           #error_pid_sd = sd(error_pid),
                                                           req_pid = mean(req_pid),
                                                           #req_pid_sd = sd(req_pid),
                                                           error_k = mean(error_k),
                                                           #error_k_sd = sd(error_k),
                                                           req_k = mean(req_k))
                                                           #req_k_sd = sd(req_k))
df_speedup <- df_pid %>% group_by(kp, ki, kd) %>% summarise(speedup_error = error_k / error_pid,
                                                            speedup_req = req_k / req_pid)
head(df_speedup)

Less Requests

tail(df_speedup[order(df_speedup$speedup_req),])

Less Error

tail(df_speedup[order(df_speedup$speedup_error),])

Email Analysis