Deep Learning with R – Chapter 3

knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE,
  echo = TRUE, 
  eval = FALSE,
  dpi = 300, 
  tidy = "styler", 
  fig.width = 8, 
  fig.height = 5
)

Packages

library(keras3)
library(tensorflow)

Find out the TensorFlow version on the machine.

tensorflow::tf_config()

3.6 TensorFlow tensors

r_array <- array(1:6, c(2, 3))

tf_tensor <- as_tensor(r_array)

tf_tensor

dim(tf_tensor)
length(tf_tensor)
  • rank (ndim) = how many axes

  • shape = how many elements along each axis

  • type (dtype) = the range of values and the memory footprint
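
All three attributes can be read directly off the tensor created above:

tf_tensor$ndim # rank: 2
tf_tensor$shape # shape: (2, 3)
tf_tensor$dtype # dtype: int32 (R integers convert to int32)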

# built-in math
tf_tensor + tf_tensor

We can often write the same code for TensorFlow tensors as for R arrays: many R generics, such as the arithmetic operators, also work on tensors.
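
A minimal illustration, reusing r_array and tf_tensor from above; the arithmetic generics dispatch to the matching TensorFlow ops:

# the same expression on an R array and on a tensor
r_array * 2L
tf_tensor * 2L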

3.7 Tensor attributes

tf_tensor$ndim

length(dim(tf_tensor)) == tf_tensor$ndim

ndim = rank of the tensor = number of axes

# scalar, no axes, rank = 0
as_tensor(1)$ndim

# vector, 1 axis, rank = 1
as_tensor(1:10)$ndim

# matrix, 2 axes, rank = 2
tf_tensor$ndim
tf_tensor$shape
  • shape = how many elements along each axis

  • TensorShape([2, 3]) means: 2 elements along the first axis, 3 along the second

# create TensorShape object with shape() function
shape(2, 3)
# dtype: data type
tf_tensor$dtype

In R, the default floating-point numeric datatype, double, is converted to tf.float64.

r_array <- array(1)
r_array
typeof(r_array)

as_tensor(r_array)$dtype

# in the book, `float32` is used as the default floating-point datatype
as_tensor(r_array, dtype = "float32")

Tensor shape and reshaping

# make a 2 x 3 tensor filled with 0
as_tensor(0, shape = c(2, 3))

# convert an R vector to a 2 x 3 tensor
as_tensor(1:6, shape = c(2, 3))

Tensors vs. R arrays:

  • TensorFlow tensors use row-major ordering (C ordering)

  • R arrays use column-major ordering (Fortran ordering)

# explicit about reshaping behavior

# C ordering
array_reshape(1:6, dim = c(2, 3), order = "C")

# Fortran ordering
array_reshape(1:6, dim = c(2, 3), order = "F")

array_reshape() and as_tensor() can automatically infer the size of one unspecified axis: pass -1 or NA for the axis whose size should be inferred.

array_reshape(1:6, c(-1, 2))
array_reshape(1:6, c(NA, 2))

as_tensor(1:6, shape = c(2, NA))
as_tensor(1:6, shape = c(2, -1))
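
To see the two fill orders side by side (an illustrative check; as.array() preserves the tensor's layout):

# column-major fill: prints as rows 1 3 5 / 2 4 6
array(1:6, c(2, 3))

# row-major fill: prints as rows 1 2 3 / 4 5 6
as.array(as_tensor(1:6, shape = c(2, 3)))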

Tensor slicing

Tensor slicing supports open-ended ranges: NA means "up to the end of that axis", and a negative index counts back from the end, so we can ask for "the rest of the tensor in that direction".

train_images <- as_tensor(dataset_mnist()$train$x)
train_images$shape

# pixels 15 through the end along both spatial axes
my_slice <- train_images[, 15:NA, 15:NA]

# crop the borders: pixels 8 through the 8th-from-last
my_slice <- train_images[, 8:-8, 8:-8]

# the first 100 images; all_dims() keeps every remaining axis in full
my_slice <- train_images[1:100, all_dims()]
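
The same index semantics on a small 1-D tensor, where the result is easy to read (an illustrative sketch):

x <- as_tensor(1:10)
x[4:NA] # from the 4th element to the end: 4 5 6 7 8 9 10
x[2:-2] # from the 2nd element to the 2nd-from-last: 2 3 4 5 6 7 8 9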

Tensor broadcasting

Broadcasting happens when an operation combines two tensors of different shapes: the smaller tensor is (virtually) repeated to match the shape of the larger one. Here y, of shape (32, 10), is broadcast across the two leading axes of x, so z has shape (64, 3, 32, 10).

x <- as_tensor(1, shape = c(64, 3, 32, 10))
y <- as_tensor(2, shape = c(32, 10))
z <- x + y
z
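
A smaller case where the broadcast is easy to verify by eye (illustrative values):

m <- as_tensor(1:6, shape = c(2, 3))
v <- as_tensor(c(10L, 20L, 30L))
m + v # v is added to every row of m; result shape (2, 3)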

The tf module

How do we create tensors?

We have used as_tensor() to create tensors. The tf module offers several other constructors:

library(tensorflow)

# all-ones tensor
tf$ones(shape(1, 3))

# all-zero tensor
tf$zeros(shape(1, 3))

# tensors of Gaussian random variable
tf$random$normal(shape(1, 3), mean = 0, stddev = 1)

Be careful with R syntax: tf functions expect integer arguments, but c(2, 1) is a vector of doubles in R.

tf$ones(c(2, 1)) # wrong: doubles, not integers

tf$ones(c(2L, 1L)) # right: the L suffix creates integers

The tf module uses 0-based indexing; R uses 1-based indexing.

m <- as_tensor(1:12, shape = c(3, 4))
m

# take column mean, tf uses 0-based index
tf$reduce_mean(m, axis = 0L, keepdims = TRUE)

# take column mean, R uses 1-based index
mean(m, axis = 1, keepdims = TRUE)

Constant tensors and variables

Note: tensors are not modifiable! They are constant!

# in R, array is modifiable
x <- array(1, dim = c(2, 2))
x[1, 1] <- 0
x

# tensor is not modifiable
x <- as_tensor(1, shape = c(2, 2))
x[1, 1] <- 0 # error

If we need to update or change the state of a tensor, we need variables:

# create a TensorFlow variable
v <- tf$Variable(initial_value = tf$random$normal(shape(3, 1)))
v

# assign value to a variable
v$assign(tf$ones(shape(3, 1)))
v

# assign_add() means x <- x + value
v$assign_add(tf$ones(shape(3, 1)))

# assign_sub() means x <- x - value
v$assign_sub(tf$ones(shape(3, 1)))

Tensor operations: Doing math in TensorFlow

a <- tf$ones(c(2L, 2L))
a

a <- as_tensor(c(4, 4, 4, 4), shape = c(2, 2))
a

b <- tf$square(a)
b

c <- tf$sqrt(a)
c

d <- b + c # add two tensors (element-wise)
d

e <- tf$matmul(a, b) # matrix product (dot product) of two tensors
e

e <- e * d # multiply two tensors (element-wise)
e

The GradientTape API

Plain R cannot retrieve the gradient of a differentiable expression with respect to its inputs, but TensorFlow can: we record the computation on a GradientTape and then ask the tape for gradients.

# using the GradientTape
input_var <- tf$Variable(initial_value = 3)

with(tf$GradientTape() %as% tape, {
  result <- tf$square(input_var)
})

gradient <- tape$gradient(result, input_var)

Why care?

This is most commonly used to retrieve the gradients of the loss of a model with respect to its weights:

# gradient <- tape$gradient(loss, weights)
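
A minimal sketch of this pattern with a single scalar weight; w, x, and y_true are illustrative stand-ins, not part of the notes:

w <- tf$Variable(2) # a trainable scalar "weight"
x <- as_tensor(3, dtype = "float32")
y_true <- as_tensor(10, dtype = "float32")

with(tf$GradientTape() %as% tape, {
  y_pred <- w * x # a one-parameter "model"
  loss <- tf$square(y_true - y_pred) # squared-error "loss"
})

tape$gradient(loss, w) # d/dw (10 - 3w)^2 = -6 * (10 - 3w) = -24 at w = 2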

Note: only trainable variables are tracked by default

With a constant tensor, we need to manually mark it as being tracked by calling tape$watch()

# using GradientTape with constant tensor inputs

input_const <- as_tensor(3)

with(tf$GradientTape() %as% tape, {
  tape$watch(input_const) # manually mark it as being tracked
  result <- tf$square(input_const)
})

gradient <- tape$gradient(result, input_const)

gradient

Compute second-order gradients

# using nested gradient tapes to compute second-order gradients

time <- tf$Variable(0)

with(tf$GradientTape() %as% outer_tape, {
  with(tf$GradientTape() %as% inner_tape, {
    position <- 4.9 * time^2 # free-fall distance
  })
  # first derivative: speed = 9.8 * time
  speed <- inner_tape$gradient(position, time)
})

# second derivative: the constant 9.8
acceleration <- outer_tape$gradient(speed, time)

acceleration

Example: A linear classifier in pure TensorFlow

  • Generate two classes of points by drawing their coordinates from a bivariate normal distribution with a given covariance matrix and mean

  • The covariance matrix describes the shape of the point cloud, and the mean describes its position in the plane

# generating two classes of points
num_samples_per_class <- 1000

# covariance matrix
Sigma <- rbind(
  c(1, 0.5),
  c(0.5, 1)
)

negative_samples <- MASS::mvrnorm(
  n = num_samples_per_class,
  mu = c(0, 3),
  Sigma = Sigma
)

positive_samples <- MASS::mvrnorm(
  n = num_samples_per_class,
  mu = c(3, 0),
  Sigma = Sigma
)

# stacking them into an array
inputs <- rbind(negative_samples, positive_samples)

# generating the corresponding targets (0, 1)
targets <- rbind(
  array(0, dim = c(num_samples_per_class, 1)),
  array(1, dim = c(num_samples_per_class, 1))
)

Plot the data:

plot(
  x = inputs[, 1],
  y = inputs[, 2],
  col = ifelse(targets[, 1] == 0, "purple", "green")
)

Create a linear classifier to separate these two classes of points:

  • a linear classifier is an affine transformation, \(\text{prediction} = W \cdot \text{input} + b\), trained to minimize the squared difference between predictions and targets
# creating the linear classifier variables

# 1. specify input dim: inputs will be 2D points
input_dim <- 2

# 2. specify output dim: output predictions will be a single score per sample
output_dim <- 1

# 3. create W and b and initialize
W <- tf$Variable(
  initial_value = tf$random$uniform(shape(input_dim, output_dim))
)

W

b <- tf$Variable(
  initial_value = tf$zeros(shape(output_dim))
)

b

Specify model and loss function

# the forward pass function
model <- function(inputs) {
  tf$matmul(inputs, W) + b
}

# the mean square loss function

# R style
square_loss <- function(targets, predictions) {
  per_sample_losses <- (targets - predictions)^2
  mean(per_sample_losses)
}

# tf style
square_loss <- function(targets, predictions) {
  per_sample_losses <- tf$square(tf$subtract(targets, predictions))
  tf$reduce_mean(per_sample_losses)
}
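
A quick sanity check of the loss on made-up values:

# mean of (1-1)^2, (2-2)^2, (3-5)^2 = 4/3
square_loss(as_tensor(c(1, 2, 3)), as_tensor(c(1, 2, 5)))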

Training

# the training step function
learning_rate <- 0.1

# train step contains:
# 1. gradient computation
# 2. weight update

training_step <- function(inputs, targets) {
  # forward pass, inside a gradient tape scope
  with(tf$GradientTape() %as% tape, {
    predictions <- model(inputs)
    loss <- square_loss(targets, predictions)
  })

  # retrieve the gradient of the loss with regard to weights
  grad_loss_wrt <- tape$gradient(loss, list(W = W, b = b))

  # update weights
  W$assign_sub(grad_loss_wrt$W * learning_rate)
  b$assign_sub(grad_loss_wrt$b * learning_rate)
  loss
}

Above is batch training rather than mini-batch training:

  • In batch training, each training step (gradient computation plus weight update) uses ALL of the data, rather than a small batch of it; a mini-batch variant is sketched below
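
For contrast, a minimal sketch of a mini-batch variant; batch_size is an assumption, and this assumes inputs is still an R matrix (i.e., before the as_tensor() conversion below):

batch_size <- 128 # assumed mini-batch size

for (step in seq(40)) {
  # draw a random mini-batch of rows at each step
  idx <- sample(nrow(inputs), batch_size)
  batch_inputs <- as_tensor(inputs[idx, ], dtype = "float32")
  batch_targets <- targets[idx, , drop = FALSE]
  loss <- training_step(batch_inputs, batch_targets)
}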
# the batch training loop

inputs <- as_tensor(inputs, dtype = "float32")

# using 40 steps
for (step in seq(40)) {
  loss <- training_step(inputs, targets)
  cat(sprintf("Loss at step %s: %.4f\n", step, loss))
}
predictions <- model(inputs)

# convert tensors to R arrays for plotting
inputs <- as.array(inputs)
predictions <- as.array(predictions)

plot(
  inputs[, 1],
  inputs[, 2],
  col = ifelse(predictions[, 1] <= 0.5, "purple", "green")
)

# plot the separating line: on the boundary the model predicts 0.5,
# i.e. W[1]*x1 + W[2]*x2 + b = 0.5, so x2 = (0.5 - b)/W[2] - (W[1]/W[2]) * x1
slope <- -W[1, ] / W[2, ]
intercept <- (0.5 - b) / W[2, ]
abline(as.array(intercept), as.array(slope), col = "red")

3.8 Understanding core Keras APIs

Layers

  • think of layers as the LEGO bricks of deep learning

  • building deep learning models in Keras is done by clipping together compatible layers to form useful data-transformation pipelines

# implement a dense layer
layer_simple_dense <- new_layer_class(
  classname = "SimpleDense",
  initialize = function(units, activation = NULL) {
    super$initialize()
    self$units <- as.integer(units)
    self$activation <- activation
  },

  # build(): to create weights
  build = function(input_shape) {
    input_dim <- input_shape[length(input_shape)]

    self$W <- self$add_weight(
      shape = c(input_dim, self$units),
      initializer = "random_normal"
    )

    self$b <- self$add_weight(
      shape = c(self$units),
      initializer = "zeros"
    )
  },

  # call(): to do computation
  call = function(inputs) {
    # affine transformation
    y <- tf$matmul(inputs, self$W) + self$b

    # activation
    if (!is.null(self$activation)) {
      y <- self$activation(y)
    }

    y
  }
)

Instantiate our layer

my_dense <- layer_simple_dense(
  units = 32,
  activation = tf$nn$relu
)

input_tensor <- as_tensor(1, shape = c(2, 784))

output_tensor <- my_dense(input_tensor)

output_tensor$shape

Automatic shape inference

units: the number of neurons in the layer; it determines the dimensionality of the output.

Recall that each neuron (unit) performs an affine transformation followed by an activation, \(\sigma(W^T X + b)\)

# the layer will transform the input data into a 32-dimensional vector
layer <- layer_dense(units = 32, activation = "relu")

No need to worry about size compatibility most of the time

model <- keras_model_sequential(layers = list(
  layer_dense(units = 32, activation = "relu"),
  layer_dense(units = 32)
))

Key takeaway:

  • No need to specify the input shape: it is inferred automatically from the first inputs the layer sees, which ensures size compatibility

  • We only need to provide units (the output dimensionality); a sketch of this lazy inference follows below
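
A minimal sketch of this lazy weight creation, using the SimpleDense layer defined earlier; inspecting layer$W$shape is just for illustration:

layer <- layer_simple_dense(units = 32)

# no weights yet: build() runs on the first call,
# when the input shape becomes known
x <- as_tensor(1, shape = c(2, 784))
out <- layer(x)

layer$W$shape # (784, 32): input_dim was inferred from the data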

Use the pipe to add layers

model <- keras_model_sequential() %>%
  layer_simple_dense(32, activation = "relu") %>%
  layer_simple_dense(64, activation = "relu") %>%
  layer_simple_dense(32, activation = "relu") %>%
  layer_simple_dense(10, activation = "softmax")

From layers to models