data.table - row-wise operations

Setup

This section contains the setup and the various utility functions used throughout.

Libraries used:

library(data.table)
library(Rfast)

Introduction

Take \(n\) samples from \(Y \sim \mathsf{Normal}(10, 2^2)\).

set.seed(2)
n <- 10
y <- rnorm(n, 10, 2)

Consider computing row-wise operations on this data across some grid with \(N\) rows.

grid <- data.table(
  expand.grid(mu = c(9, 10, 11),
              sigma = c(1, 2))
)
grid[, id := 1:.N]
setkey(grid, id)
grid

##    mu sigma id
## 1:  9     1  1
## 2: 10     1  2
## 3: 11     1  3
## 4:  9     2  4
## 5: 10     2  5
## 6: 11     2  6

For example, perhaps we want the log-likelihood using the Normal density function.

loglik1 <- function(y, mu = 0, sigma = 1){
  log(1/(sqrt(2*pi)*sigma)) - (1/2) * ((y - mu)/sigma)^2
}

Method 1

Traverse each row \(i = \{1, 2, \dots, N\}\) of the grid and compute the log likelihood for the observed data \(\pmb{y}\) at the current parameter values \(\mu_i\) and \(\sigma_i\)

l1 <- sapply(1:nrow(grid), function(i){
  sum(loglik1(y, mu = grid[i, mu], sigma = grid[i, sigma]))
})
grid[, l1 := l1]

Method 2

Another iterative approach that is functionally equivalent to first approach.

wraploglik <- function(dd){
  sum(loglik1(y, dd$mu, dd$sigma))
}
grid[, l2 := wraploglik(.SD), by = seq_len(nrow(grid))]

Method 3

Transform the process into a series of matrix manipulations.

Step 1

Let \(\pmb{\mu}\) be the vector of means of length \(N\) defined by the grid and \(\pmb{1}_n\) be a vector of \(1\)’s having length \(n\); the same length as \(\pmb{y}\). Form the \(n \times N\) outer-product matrix from \(\pmb{P} = \pmb{1}_n \pmb{\mu}^T\) using the crossprod function, which is equivalent to x %*% t(y). In R:

m_grid <- as.matrix(grid[, .(mu)])
m_x <- matrix(rep(1, length(y)), ncol = 1)
# The column for mu in grid becomes a row for each value of y
tcrossprod(m_x, m_grid)

##       [,1] [,2] [,3] [,4] [,5] [,6]
##  [1,]    9   10   11    9   10   11
##  [2,]    9   10   11    9   10   11
##  [3,]    9   10   11    9   10   11
##  [4,]    9   10   11    9   10   11
##  [5,]    9   10   11    9   10   11
##  [6,]    9   10   11    9   10   11
##  [7,]    9   10   11    9   10   11
##  [8,]    9   10   11    9   10   11
##  [9,]    9   10   11    9   10   11
## [10,]    9   10   11    9   10   11

Step 2

Build a diagonal matrix from the reciprocal of each element from the vector of standard deviations \(\pmb{\sigma}\), also from the grid.

diag(1/grid$sigma)

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    0    0  0.0  0.0  0.0
## [2,]    0    1    0  0.0  0.0  0.0
## [3,]    0    0    1  0.0  0.0  0.0
## [4,]    0    0    0  0.5  0.0  0.0
## [5,]    0    0    0  0.0  0.5  0.0
## [6,]    0    0    0  0.0  0.0  0.5

When we matrix multiply by the reciprocal of \(\sigma_i\), it is equivalent to the division by the standard deviations.

Step 3

Compute the first component of the log likelihood:

\[ \begin{equation} \begin{split} \pmb{D} = -\frac{1}{2} \biggl[ [\pmb{y} \pmb{1}_N - \pmb{P}] \mathsf{diag}\biggl( \frac{1}{\sigma_i} \biggr) \biggr]^2 \end{split} \tag{1} \end{equation} \]

If done verbatim as follows, the code involves a matrix multiplication with the diagonal matrix in order to divide by \(\sigma_i\), which is very computationally expensive:

(D <- -(1/2)*(  (y - tcrossprod(m_x, m_grid)) %*% diag(1/grid$sigma)  )^2)

##              [,1]        [,2]        [,3]        [,4]         [,5]        [,6]
##  [1,]  -0.3150823 -1.60891141 -3.90274050 -0.07877058 -0.402227852 -0.97568513
##  [2,]  -0.9380368 -0.06833844 -0.19864007 -0.23450920 -0.017084611 -0.04966002
##  [3,]  -8.7181963 -5.04250559 -2.36681493 -2.17954906 -1.260626398 -0.59170373
##  [4,]  -0.7947470 -2.55549833 -5.31624968 -0.19868675 -0.638874582 -1.32906242
##  [5,]  -0.3523772 -0.01288069 -0.67338420 -0.08809429 -0.003220172 -0.16834605
##  [6,]  -0.7999108 -0.03507026 -0.27022969 -0.19997771 -0.008767566 -0.06755742
##  [7,]  -2.9183093 -1.00239980 -0.08649034 -0.72957731 -0.250599949 -0.02162258
##  [8,]  -0.1355142 -0.11491029 -1.09430633 -0.03387856 -0.028727571 -0.27357658
##  [9,] -12.3452215 -7.87627361 -4.40732574 -3.08630537 -1.969068403 -1.10183143
## [10,]  -0.2609496 -0.03852367 -0.81609769 -0.06523741 -0.009630917 -0.20402442

A better approach is to use sweep which will sweep a summary over the extents of an input array. For example, if we have matrix:

matrix(1:12, nrow = 3, byrow = T)

##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    5    6    7    8
## [3,]    9   10   11   12

and wanted to divide each column by the value in a vector indexed by the respective column, we could do:

sweep(matrix(1:12, nrow = 3, byrow = T), 2, 1:4, FUN = "/")

##      [,1] [,2]     [,3] [,4]
## [1,]    1    1 1.000000    1
## [2,]    5    3 2.333333    2
## [3,]    9    5 3.666667    3

So, to divide the matrix formed from y and the outer-product by each of the grids values for \(\sigma_i\), we can do

(D <- -(1/2)*(sweep((y - tcrossprod(m_x, m_grid)), 2, grid$sigma, FUN = "/"))^2)

##              [,1]        [,2]        [,3]        [,4]         [,5]        [,6]
##  [1,]  -0.3150823 -1.60891141 -3.90274050 -0.07877058 -0.402227852 -0.97568513
##  [2,]  -0.9380368 -0.06833844 -0.19864007 -0.23450920 -0.017084611 -0.04966002
##  [3,]  -8.7181963 -5.04250559 -2.36681493 -2.17954906 -1.260626398 -0.59170373
##  [4,]  -0.7947470 -2.55549833 -5.31624968 -0.19868675 -0.638874582 -1.32906242
##  [5,]  -0.3523772 -0.01288069 -0.67338420 -0.08809429 -0.003220172 -0.16834605
##  [6,]  -0.7999108 -0.03507026 -0.27022969 -0.19997771 -0.008767566 -0.06755742
##  [7,]  -2.9183093 -1.00239980 -0.08649034 -0.72957731 -0.250599949 -0.02162258
##  [8,]  -0.1355142 -0.11491029 -1.09430633 -0.03387856 -0.028727571 -0.27357658
##  [9,] -12.3452215 -7.87627361 -4.40732574 -3.08630537 -1.969068403 -1.10183143
## [10,]  -0.2609496 -0.03852367 -0.81609769 -0.06523741 -0.009630917 -0.20402442

Step 4

Compute the second component of the log likelihood:

\[ \begin{equation} \begin{split} \pmb{S} = \log{\biggl( \frac{1}{ \sqrt{2\pi} \sigma_i} \biggr)} \end{split} \tag{2} \end{equation} \]

S <- log(1/(sqrt(2*pi) * grid$sigma ))
S

## [1] -0.9189385 -0.9189385 -0.9189385 -1.6120857 -1.6120857 -1.6120857

Step 5

Use the sweep function again to subtract each column from each element in \(\pmb{S}\).

sweep(D, 2, S, FUN = "+")

##             [,1]       [,2]      [,3]      [,4]      [,5]      [,6]
##  [1,]  -1.234021 -2.5278499 -4.821679 -1.690856 -2.014314 -2.587771
##  [2,]  -1.856975 -0.9872770 -1.117579 -1.846595 -1.629170 -1.661746
##  [3,]  -9.637135 -5.9614441 -3.285753 -3.791635 -2.872712 -2.203789
##  [4,]  -1.713686 -3.4744369 -6.235188 -1.810772 -2.250960 -2.941148
##  [5,]  -1.271316 -0.9318192 -1.592323 -1.700180 -1.615306 -1.780432
##  [6,]  -1.718849 -0.9540088 -1.189168 -1.812063 -1.620853 -1.679643
##  [7,]  -3.837248 -1.9213383 -1.005429 -2.341663 -1.862686 -1.633708
##  [8,]  -1.054453 -1.0338488 -2.013245 -1.645964 -1.640813 -1.885662
##  [9,] -13.264160 -8.7952121 -5.326264 -4.698391 -3.581154 -2.713917
## [10,]  -1.179888 -0.9574622 -1.735036 -1.677323 -1.621717 -1.816110

from which the sum of each column gives the log likelihood for the data based on each \((\mu_i, \sigma_i)\) pair.

We can bundle all these steps into a new log likelihood function.

loglik2 <- function(y, grid){
  m_grid <- as.matrix(grid[, .(mu)])
  m_x <- matrix(rep(1, length(y)), ncol = 1)
  
  D <- -(1/2)*(sweep((y - Rfast::Tcrossprod(m_x, m_grid)), 2, grid$sigma, FUN = "/"))^2
  S <- log(1/(sqrt(2*pi) * grid$sigma ))

  colSums(sweep(D, 2, S, FUN = "+"))
}  
grid[, l3 := loglik2(y, .SD)]

from which we should have the same values for the log likelihood under each method:

grid

##    mu sigma id        l1        l2        l3
## 1:  9     1  1 -36.76773 -36.76773 -36.76773
## 2: 10     1  2 -27.54470 -27.54470 -27.54470
## 3: 11     1  3 -28.32166 -28.32166 -28.32166
## 4:  9     2  4 -23.01544 -23.01544 -23.01544
## 5: 10     2  5 -20.70969 -20.70969 -20.70969
## 6: 11     2  6 -20.90393 -20.90393 -20.90393

Speed

Function for simulating data

set.seed(1)
getdata <- function(n = 100, mu = 10, sigma = 2){
  y <- array(rnorm(n, mu, sigma), dim = c(n, 1))
  grid <- data.table(
    expand.grid(mu = seq(mu - 3*sigma, mu + 3*sigma, len = 100),
                sigma = seq(0, sigma * 2, len = 100))
  )
  grid[, id := 1:.N]
  setkey(grid, id)
  
  return(list(y = y, grid = grid))
}

Time method 1, the iterative approach.

l <- getdata()
start <- Sys.time()
l1 <- sapply(1:nrow(l$grid), function(i){
  sum(loglik1(l$y, mu = l$grid[i, mu], sigma = l$grid[i, sigma]))
})
grid[, l1 := l1]
difftime(Sys.time(), start)

## Time difference of 4.024186 secs

Time method 3 using linear algebra and vectorisation to speed up the process.

start <- Sys.time()
grid[, l3 := loglik2(y, .SD)]
difftime(Sys.time(), start)

## Time difference of 0.002532482 secs

And ensure that the results are the same

identical(grid$l1, grid$l3)

## [1] TRUE

Summary

The R data.table package generally performs very well, but non-vectorised row-wise operations can be slow unless implemented with thought. Three methods for row-wise operations were reviewed with the vectorised approach being significantly faster than the others.