This section contains the setup and the various utility functions used throughout.
Libraries used:
library(data.table)
library(Rfast)
Take \(n\) samples from \(Y \sim \mathsf{Normal}(10, 2^2)\).
set.seed(2)
n <- 10
y <- rnorm(n, 10, 2)
Consider computing row-wise operations on this data across some grid with \(N\) rows.
grid <- data.table(
expand.grid(mu = c(9, 10, 11),
sigma = c(1, 2))
)
grid[, id := 1:.N]
setkey(grid, id)
grid
## mu sigma id
## 1: 9 1 1
## 2: 10 1 2
## 3: 11 1 3
## 4: 9 2 4
## 5: 10 2 5
## 6: 11 2 6
For example, perhaps we want the log-likelihood using the Normal density function.
loglik1 <- function(y, mu = 0, sigma = 1){
log(1/(sqrt(2*pi)*sigma)) - (1/2) * ((y - mu)/sigma)^2
}
Traverse each row \(i = \{1, 2, \dots, N\}\) of the grid and compute the log likelihood for the observed data \(\pmb{y}\) at the current parameter values \(\mu_i\) and \(\sigma_i\)
l1 <- sapply(1:nrow(grid), function(i){
sum(loglik1(y, mu = grid[i, mu], sigma = grid[i, sigma]))
})
grid[, l1 := l1]
Another iterative approach that is functionally equivalent to first approach.
wraploglik <- function(dd){
sum(loglik1(y, dd$mu, dd$sigma))
}
grid[, l2 := wraploglik(.SD), by = seq_len(nrow(grid))]
Transform the process into a series of matrix manipulations.
Step 1
Let \(\pmb{\mu}\) be the vector of means of length \(N\) defined by the grid and \(\pmb{1}_n\) be a vector of \(1\)’s having length \(n\); the same length as \(\pmb{y}\).
Form the \(n \times N\) outer-product matrix from \(\pmb{P} = \pmb{1}_n \pmb{\mu}^T\) using the crossprod
function, which is equivalent to x %*% t(y)
.
In R:
m_grid <- as.matrix(grid[, .(mu)])
m_x <- matrix(rep(1, length(y)), ncol = 1)
# The column for mu in grid becomes a row for each value of y
tcrossprod(m_x, m_grid)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 9 10 11 9 10 11
## [2,] 9 10 11 9 10 11
## [3,] 9 10 11 9 10 11
## [4,] 9 10 11 9 10 11
## [5,] 9 10 11 9 10 11
## [6,] 9 10 11 9 10 11
## [7,] 9 10 11 9 10 11
## [8,] 9 10 11 9 10 11
## [9,] 9 10 11 9 10 11
## [10,] 9 10 11 9 10 11
Step 2
Build a diagonal matrix from the reciprocal of each element from the vector of standard deviations \(\pmb{\sigma}\), also from the grid.
diag(1/grid$sigma)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 0 0 0.0 0.0 0.0
## [2,] 0 1 0 0.0 0.0 0.0
## [3,] 0 0 1 0.0 0.0 0.0
## [4,] 0 0 0 0.5 0.0 0.0
## [5,] 0 0 0 0.0 0.5 0.0
## [6,] 0 0 0 0.0 0.0 0.5
When we matrix multiply by the reciprocal of \(\sigma_i\), it is equivalent to the division by the standard deviations.
Step 3
Compute the first component of the log likelihood:
\[ \begin{equation} \begin{split} \pmb{D} = -\frac{1}{2} \biggl[ [\pmb{y} \pmb{1}_N - \pmb{P}] \mathsf{diag}\biggl( \frac{1}{\sigma_i} \biggr) \biggr]^2 \end{split} \tag{1} \end{equation} \]
If done verbatim as follows, the code involves a matrix multiplication with the diagonal matrix in order to divide by \(\sigma_i\), which is very computationally expensive:
(D <- -(1/2)*( (y - tcrossprod(m_x, m_grid)) %*% diag(1/grid$sigma) )^2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.3150823 -1.60891141 -3.90274050 -0.07877058 -0.402227852 -0.97568513
## [2,] -0.9380368 -0.06833844 -0.19864007 -0.23450920 -0.017084611 -0.04966002
## [3,] -8.7181963 -5.04250559 -2.36681493 -2.17954906 -1.260626398 -0.59170373
## [4,] -0.7947470 -2.55549833 -5.31624968 -0.19868675 -0.638874582 -1.32906242
## [5,] -0.3523772 -0.01288069 -0.67338420 -0.08809429 -0.003220172 -0.16834605
## [6,] -0.7999108 -0.03507026 -0.27022969 -0.19997771 -0.008767566 -0.06755742
## [7,] -2.9183093 -1.00239980 -0.08649034 -0.72957731 -0.250599949 -0.02162258
## [8,] -0.1355142 -0.11491029 -1.09430633 -0.03387856 -0.028727571 -0.27357658
## [9,] -12.3452215 -7.87627361 -4.40732574 -3.08630537 -1.969068403 -1.10183143
## [10,] -0.2609496 -0.03852367 -0.81609769 -0.06523741 -0.009630917 -0.20402442
A better approach is to use sweep
which will sweep a summary over the extents of an input array. For example, if we have matrix:
matrix(1:12, nrow = 3, byrow = T)
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 5 6 7 8
## [3,] 9 10 11 12
and wanted to divide each column by the value in a vector indexed by the respective column, we could do:
sweep(matrix(1:12, nrow = 3, byrow = T), 2, 1:4, FUN = "/")
## [,1] [,2] [,3] [,4]
## [1,] 1 1 1.000000 1
## [2,] 5 3 2.333333 2
## [3,] 9 5 3.666667 3
So, to divide the matrix formed from y and the outer-product by each of the grids values for \(\sigma_i\), we can do
(D <- -(1/2)*(sweep((y - tcrossprod(m_x, m_grid)), 2, grid$sigma, FUN = "/"))^2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.3150823 -1.60891141 -3.90274050 -0.07877058 -0.402227852 -0.97568513
## [2,] -0.9380368 -0.06833844 -0.19864007 -0.23450920 -0.017084611 -0.04966002
## [3,] -8.7181963 -5.04250559 -2.36681493 -2.17954906 -1.260626398 -0.59170373
## [4,] -0.7947470 -2.55549833 -5.31624968 -0.19868675 -0.638874582 -1.32906242
## [5,] -0.3523772 -0.01288069 -0.67338420 -0.08809429 -0.003220172 -0.16834605
## [6,] -0.7999108 -0.03507026 -0.27022969 -0.19997771 -0.008767566 -0.06755742
## [7,] -2.9183093 -1.00239980 -0.08649034 -0.72957731 -0.250599949 -0.02162258
## [8,] -0.1355142 -0.11491029 -1.09430633 -0.03387856 -0.028727571 -0.27357658
## [9,] -12.3452215 -7.87627361 -4.40732574 -3.08630537 -1.969068403 -1.10183143
## [10,] -0.2609496 -0.03852367 -0.81609769 -0.06523741 -0.009630917 -0.20402442
Step 4
Compute the second component of the log likelihood:
\[ \begin{equation} \begin{split} \pmb{S} = \log{\biggl( \frac{1}{ \sqrt{2\pi} \sigma_i} \biggr)} \end{split} \tag{2} \end{equation} \]
S <- log(1/(sqrt(2*pi) * grid$sigma ))
S
## [1] -0.9189385 -0.9189385 -0.9189385 -1.6120857 -1.6120857 -1.6120857
Step 5
Use the sweep function again to subtract each column from each element in \(\pmb{S}\).
sweep(D, 2, S, FUN = "+")
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.234021 -2.5278499 -4.821679 -1.690856 -2.014314 -2.587771
## [2,] -1.856975 -0.9872770 -1.117579 -1.846595 -1.629170 -1.661746
## [3,] -9.637135 -5.9614441 -3.285753 -3.791635 -2.872712 -2.203789
## [4,] -1.713686 -3.4744369 -6.235188 -1.810772 -2.250960 -2.941148
## [5,] -1.271316 -0.9318192 -1.592323 -1.700180 -1.615306 -1.780432
## [6,] -1.718849 -0.9540088 -1.189168 -1.812063 -1.620853 -1.679643
## [7,] -3.837248 -1.9213383 -1.005429 -2.341663 -1.862686 -1.633708
## [8,] -1.054453 -1.0338488 -2.013245 -1.645964 -1.640813 -1.885662
## [9,] -13.264160 -8.7952121 -5.326264 -4.698391 -3.581154 -2.713917
## [10,] -1.179888 -0.9574622 -1.735036 -1.677323 -1.621717 -1.816110
from which the sum of each column gives the log likelihood for the data based on each \((\mu_i, \sigma_i)\) pair.
We can bundle all these steps into a new log likelihood function.
loglik2 <- function(y, grid){
m_grid <- as.matrix(grid[, .(mu)])
m_x <- matrix(rep(1, length(y)), ncol = 1)
D <- -(1/2)*(sweep((y - Rfast::Tcrossprod(m_x, m_grid)), 2, grid$sigma, FUN = "/"))^2
S <- log(1/(sqrt(2*pi) * grid$sigma ))
colSums(sweep(D, 2, S, FUN = "+"))
}
grid[, l3 := loglik2(y, .SD)]
from which we should have the same values for the log likelihood under each method:
grid
## mu sigma id l1 l2 l3
## 1: 9 1 1 -36.76773 -36.76773 -36.76773
## 2: 10 1 2 -27.54470 -27.54470 -27.54470
## 3: 11 1 3 -28.32166 -28.32166 -28.32166
## 4: 9 2 4 -23.01544 -23.01544 -23.01544
## 5: 10 2 5 -20.70969 -20.70969 -20.70969
## 6: 11 2 6 -20.90393 -20.90393 -20.90393
Function for simulating data
set.seed(1)
getdata <- function(n = 100, mu = 10, sigma = 2){
y <- array(rnorm(n, mu, sigma), dim = c(n, 1))
grid <- data.table(
expand.grid(mu = seq(mu - 3*sigma, mu + 3*sigma, len = 100),
sigma = seq(0, sigma * 2, len = 100))
)
grid[, id := 1:.N]
setkey(grid, id)
return(list(y = y, grid = grid))
}
Time method 1, the iterative approach.
l <- getdata()
start <- Sys.time()
l1 <- sapply(1:nrow(l$grid), function(i){
sum(loglik1(l$y, mu = l$grid[i, mu], sigma = l$grid[i, sigma]))
})
grid[, l1 := l1]
difftime(Sys.time(), start)
## Time difference of 4.024186 secs
Time method 3 using linear algebra and vectorisation to speed up the process.
start <- Sys.time()
grid[, l3 := loglik2(y, .SD)]
difftime(Sys.time(), start)
## Time difference of 0.002532482 secs
And ensure that the results are the same
identical(grid$l1, grid$l3)
## [1] TRUE
The R data.table
package generally performs very well, but non-vectorised row-wise operations can be slow unless implemented with thought.
Three methods for row-wise operations were reviewed with the vectorised approach being significantly faster than the others.