Deep Learning with R – Chapter 3

knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE,
  echo = TRUE, 
  eval = FALSE,
  dpi = 300, 
  tidy = "styler", 
  fig.width = 8, 
  fig.height = 5
)

Packages

library(keras3)
library(tensorflow)

Find out the TensorFlow version on the machine.

tensorflow::tf_config()

3.6 TensorFlow tensors

r_array <- array(1:6, c(2, 3))

tf_tensor <- as_tensor(r_array)

tf_tensor

dim(tf_tensor)
length(tf_tensor)
  • rank (ndim) = how many axes

  • shape = how many elements along each axis

  • type (dtype) = the range of values and the memory footprint
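
All three attributes can be read directly off the tensor created above:

tf_tensor$ndim # rank: 2
tf_tensor$shape # shape: (2, 3)
tf_tensor$dtype # dtype: int32 (R integers convert to int32)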

# built-in math
tf_tensor + tf_tensor

We can often write the same code for TensorFlow tensors as for R arrays: many R generics, such as the arithmetic operators, also work on tensors.
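
A minimal illustration, reusing r_array and tf_tensor from above; the arithmetic generics dispatch to the matching TensorFlow ops:

# the same expression on an R array and on a tensor
r_array * 2L
tf_tensor * 2L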

3.7 Tensor attributes

tf_tensor$ndim

length(dim(tf_tensor)) == tf_tensor$ndim

ndim = rank of the tensor = number of axes

# scalar, no axes, rank = 0
as_tensor(1)$ndim

# vector, 1 axis, rank = 1
as_tensor(1:10)$ndim

# matrix, 2 axes, rank = 2
tf_tensor$ndim
tf_tensor$shape
  • shape = how many elements along each axis

  • TensorShape([2, 3]) means: 2 elements along the first axis, 3 along the second

# create TensorShape object with shape() function
shape(2, 3)
# dtype: data type
tf_tensor$dtype

In R, the default floating-point numeric datatype, double, is converted to tf.float64.

r_array <- array(1)
r_array
typeof(r_array)

as_tensor(r_array)$dtype

# in the book, `float32` is used as the default floating-point datatype
as_tensor(r_array, dtype = "float32")

Tensor shape and reshaping

# make a 2 x 3 tensor filled with 0
as_tensor(0, shape = c(2, 3))

# convert an R vector to a 2 x 3 tensor
as_tensor(1:6, shape = c(2, 3))

Tensors vs. R arrays:

  • TensorFlow tensors use row-major ordering (C ordering)

  • R arrays use column-major ordering (Fortran ordering)

# explicit about reshaping behavior

# C ordering
array_reshape(1:6, dim = c(2, 3), order = "C")

# Fortran ordering
array_reshape(1:6, dim = c(2, 3), order = "F")

array_reshape() and as_tensor() can automatically infer the size of one unspecified axis: pass -1 or NA for the axis whose size should be inferred.

array_reshape(1:6, c(-1, 2))
array_reshape(1:6, c(NA, 2))

as_tensor(1:6, shape = c(2, NA))
as_tensor(1:6, shape = c(2, -1))
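
To see the two fill orders side by side (an illustrative check; as.array() preserves the tensor's layout):

# column-major fill: prints as rows 1 3 5 / 2 4 6
array(1:6, c(2, 3))

# row-major fill: prints as rows 1 2 3 / 4 5 6
as.array(as_tensor(1:6, shape = c(2, 3)))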

Tensor slicing

Tensor slicing supports open-ended ranges: NA means "up to the end of that axis", and a negative index counts back from the end, so we can ask for "the rest of the tensor in that direction".

train_images <- as_tensor(dataset_mnist()$train$x)
train_images$shape

# pixels 15 through the end along both spatial axes
my_slice <- train_images[, 15:NA, 15:NA]

# crop the borders: pixels 8 through the 8th-from-last
my_slice <- train_images[, 8:-8, 8:-8]

# the first 100 images; all_dims() keeps every remaining axis in full
my_slice <- train_images[1:100, all_dims()]
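
The same index semantics on a small 1-D tensor, where the result is easy to read (an illustrative sketch):

x <- as_tensor(1:10)
x[4:NA] # from the 4th element to the end: 4 5 6 7 8 9 10
x[2:-2] # from the 2nd element to the 2nd-from-last: 2 3 4 5 6 7 8 9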

Tensor broadcasting

Broadcasting happens when an operation combines two tensors of different shapes: the smaller tensor is (virtually) repeated to match the shape of the larger one. Here y, of shape (32, 10), is broadcast across the two leading axes of x, so z has shape (64, 3, 32, 10).

x <- as_tensor(1, shape = c(64, 3, 32, 10))
y <- as_tensor(2, shape = c(32, 10))
z <- x + y
z
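
A smaller case where the broadcast is easy to verify by eye (illustrative values):

m <- as_tensor(1:6, shape = c(2, 3))
v <- as_tensor(c(10L, 20L, 30L))
m + v # v is added to every row of m; result shape (2, 3)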

The tf module

How do we create tensors?

We have used as_tensor() to create tensors. The tf module offers several other constructors:

library(tensorflow)

# all-ones tensor
tf$ones(shape(1, 3))

# all-zero tensor
tf$zeros(shape(1, 3))

# tensors of Gaussian random variable
tf$random$normal(shape(1, 3), mean = 0, stddev = 1)

Be careful with R syntax: tf functions expect integer arguments, but c(2, 1) is a vector of doubles in R.

tf$ones(c(2, 1)) # wrong: doubles, not integers

tf$ones(c(2L, 1L)) # right: the L suffix creates integers

The tf module uses 0-based indexing; R uses 1-based indexing.

m <- as_tensor(1:12, shape = c(3, 4))
m

# take column mean, tf uses 0-based index
tf$reduce_mean(m, axis = 0L, keepdims = TRUE)

# take column mean, R uses 1-based index
mean(m, axis = 1, keepdims = TRUE)

Constant tensors and variables

Note: tensors are not modifiable! They are constant!

# in R, array is modifiable
x <- array(1, dim = c(2, 2))
x[1, 1] <- 0
x

# tensor is not modifiable
x <- as_tensor(1, shape = c(2, 2))
x[1, 1] <- 0 # error

If we need to update or change the state of a tensor, we need variables:

# create a TensorFlow variable
v <- tf$Variable(initial_value = tf$random$normal(shape(3, 1)))
v

# assign value to a variable
v$assign(tf$ones(shape(3, 1)))
v

# assign_add() means x <- x + value
v$assign_add(tf$ones(shape(3, 1)))

# assign_sub() means x <- x - value
v$assign_sub(tf$ones(shape(3, 1)))

Tensor operations: Doing math in TensorFlow

a <- tf$ones(c(2L, 2L))
a

a <- as_tensor(c(4, 4, 4, 4), shape = c(2, 2))
a

b <- tf$square(a)
b

c <- tf$sqrt(a)
c

d <- b + c # add two tensors (element-wise)
d

e <- tf$matmul(a, b) # matrix product (dot product) of two tensors
e

e <- e * d # multiply two tensors (element-wise)
e

The GradientTape API

Plain R cannot retrieve the gradient of a differentiable expression with respect to its inputs, but TensorFlow can: we record the computation on a GradientTape and then ask the tape for gradients.

# using the GradientTape
input_var <- tf$Variable(initial_value = 3)

with(tf$GradientTape() %as% tape, {
  result <- tf$square(input_var)
})

gradient <- tape$gradient(result, input_var)

Why care?

This is most commonly used to retrieve the gradients of the loss of a model with respect to its weights:

# gradient <- tape$gradient(loss, weights)
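
A minimal sketch of this pattern with a single scalar weight; w, x, and y_true are illustrative stand-ins, not part of the notes:

w <- tf$Variable(2) # a trainable scalar "weight"
x <- as_tensor(3, dtype = "float32")
y_true <- as_tensor(10, dtype = "float32")

with(tf$GradientTape() %as% tape, {
  y_pred <- w * x # a one-parameter "model"
  loss <- tf$square(y_true - y_pred) # squared-error "loss"
})

tape$gradient(loss, w) # d/dw (10 - 3w)^2 = -6 * (10 - 3w) = -24 at w = 2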

Note: only trainable variables are tracked by default

With a constant tensor, we need to manually mark it as being tracked by calling tape$watch()

# using GradientTape with constant tensor inputs

input_const <- as_tensor(3)

with(tf$GradientTape() %as% tape, {
  tape$watch(input_const) # manually mark it as being tracked
  result <- tf$square(input_const)
})

gradient <- tape$gradient(result, input_const)

gradient

Compute second-order gradients

# using nested gradient tapes to compute second-order gradients

time <- tf$Variable(0)

with(tf$GradientTape() %as% outer_tape, {
  with(tf$GradientTape() %as% inner_tape, {
    position <- 4.9 * time^2 # free-fall distance
  })
  # first derivative: speed = 9.8 * time
  speed <- inner_tape$gradient(position, time)
})

# second derivative: the constant 9.8
acceleration <- outer_tape$gradient(speed, time)

acceleration

Example: A linear classifier in pure TensorFlow

  • Generate two classes of points by drawing their coordinates from a bivariate normal distribution with a given covariance matrix and mean

  • The covariance matrix describes the shape of the point cloud, and the mean describes its position in the plane

# generating two classes of points
num_samples_per_class <- 1000

# covariance matrix
Sigma <- rbind(
  c(1, 0.5),
  c(0.5, 1)
)

negative_samples <- MASS::mvrnorm(
  n = num_samples_per_class,
  mu = c(0, 3),
  Sigma = Sigma
)

positive_samples <- MASS::mvrnorm(
  n = num_samples_per_class,
  mu = c(3, 0),
  Sigma = Sigma
)

# stacking them into an array
inputs <- rbind(negative_samples, positive_samples)

# generating the corresponding targets (0, 1)
targets <- rbind(
  array(0, dim = c(num_samples_per_class, 1)),
  array(1, dim = c(num_samples_per_class, 1))
)

Plot the data:

plot(
  x = inputs[, 1],
  y = inputs[, 2],
  col = ifelse(targets[, 1] == 0, "purple", "green")
)

Create a linear classifier to separate these two classes of points:

  • a linear classifier is an affine transformation, \(\text{prediction} = W \cdot \text{input} + b\), trained to minimize the squared difference between predictions and targets
# creating the linear classifier variables

# 1. specify input dim: inputs will be 2D points
input_dim <- 2

# 2. specify output dim: output predictions will be a single score per sample
output_dim <- 1

# 3. create W and b and initialize
W <- tf$Variable(
  initial_value = tf$random$uniform(shape(input_dim, output_dim))
)

W

b <- tf$Variable(
  initial_value = tf$zeros(shape(output_dim))
)

b

Specify model and loss function

# the forward pass function
model <- function(inputs) {
  tf$matmul(inputs, W) + b
}

# the mean square loss function

# R style
square_loss <- function(targets, predictions) {
  per_sample_losses <- (targets - predictions)^2
  mean(per_sample_losses)
}

# tf style
square_loss <- function(targets, predictions) {
  per_sample_losses <- tf$square(tf$subtract(targets, predictions))
  tf$reduce_mean(per_sample_losses)
}
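
A quick sanity check of the loss on made-up values:

# mean of (1-1)^2, (2-2)^2, (3-5)^2 = 4/3
square_loss(as_tensor(c(1, 2, 3)), as_tensor(c(1, 2, 5)))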

Training

# the training step function
learning_rate <- 0.1

# train step contains:
# 1. gradient computation
# 2. weight update

training_step <- function(inputs, targets) {
  # forward pass, inside a gradient tape scope
  with(tf$GradientTape() %as% tape, {
    predictions <- model(inputs)
    loss <- square_loss(targets, predictions)
  })

  # retrieve the gradient of the loss with regard to weights
  grad_loss_wrt <- tape$gradient(loss, list(W = W, b = b))

  # update weights
  W$assign_sub(grad_loss_wrt$W * learning_rate)
  b$assign_sub(grad_loss_wrt$b * learning_rate)
  loss
}

Above is batch training rather than mini-batch training:

  • In batch training, each training step (gradient computation plus weight update) uses ALL of the data, rather than a small batch of it; a mini-batch variant is sketched below
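
For contrast, a minimal sketch of a mini-batch variant; batch_size is an assumption, and this assumes inputs is still an R matrix (i.e., before the as_tensor() conversion below):

batch_size <- 128 # assumed mini-batch size

for (step in seq(40)) {
  # draw a random mini-batch of rows at each step
  idx <- sample(nrow(inputs), batch_size)
  batch_inputs <- as_tensor(inputs[idx, ], dtype = "float32")
  batch_targets <- targets[idx, , drop = FALSE]
  loss <- training_step(batch_inputs, batch_targets)
}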
# the batch training loop

inputs <- as_tensor(inputs, dtype = "float32")

# using 40 steps
for (step in seq(40)) {
  loss <- training_step(inputs, targets)
  cat(sprintf("Loss at step %s: %.4f\n", step, loss))
}
predictions <- model(inputs)

# convert tensors to R arrays for plotting
inputs <- as.array(inputs)
predictions <- as.array(predictions)

plot(
  inputs[, 1],
  inputs[, 2],
  col = ifelse(predictions[, 1] <= 0.5, "purple", "green")
)

# plot the separating line: on the boundary the model predicts 0.5,
# i.e. W[1]*x1 + W[2]*x2 + b = 0.5, so x2 = (0.5 - b)/W[2] - (W[1]/W[2]) * x1
slope <- -W[1, ] / W[2, ]
intercept <- (0.5 - b) / W[2, ]
abline(as.array(intercept), as.array(slope), col = "red")

3.8 Understanding core Keras APIs

Layers

  • think of layers as the LEGO bricks of deep learning

  • building deep learning models in Keras is done by clipping together compatible layers to form useful data-transformation pipelines

# implement a dense layer
layer_simple_dense <- new_layer_class(
  classname = "SimpleDense",
  initialize = function(units, activation = NULL) {
    super$initialize()
    self$units <- as.integer(units)
    self$activation <- activation
  },

  # build(): to create weights
  build = function(input_shape) {
    input_dim <- input_shape[length(input_shape)]

    self$W <- self$add_weight(
      shape = c(input_dim, self$units),
      initializer = "random_normal"
    )

    self$b <- self$add_weight(
      shape = c(self$units),
      initializer = "zeros"
    )
  },

  # call(): to do computation
  call = function(inputs) {
    # affine transformation
    y <- tf$matmul(inputs, self$W) + self$b

    # activation
    if (!is.null(self$activation)) {
      y <- self$activation(y)
    }

    y
  }
)

Instantiate our layer

my_dense <- layer_simple_dense(
  units = 32,
  activation = tf$nn$relu
)

input_tensor <- as_tensor(1, shape = c(2, 784))

output_tensor <- my_dense(input_tensor)

output_tensor$shape

Automatic shape inference

units: the number of neurons in the layer; it determines the dimensionality of the output.

Recall that each neuron (unit) performs an affine transformation followed by an activation, \(\sigma(W^T X + b)\)

# the layer will transform the input data into a 32-dimensional vector
layer <- layer_dense(units = 32, activation = "relu")

No need to worry about size compatibility most of the time

model <- keras_model_sequential(layers = list(
  layer_dense(units = 32, activation = "relu"),
  layer_dense(units = 32)
))

Key takeaway:

  • No need to specify the input shape: it is inferred automatically from the first inputs the layer sees, which ensures size compatibility

  • We only need to provide units (the output dimensionality); a sketch of this lazy inference follows below
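
A minimal sketch of this lazy weight creation, using the SimpleDense layer defined earlier; inspecting layer$W$shape is just for illustration:

layer <- layer_simple_dense(units = 32)

# no weights yet: build() runs on the first call,
# when the input shape becomes known
x <- as_tensor(1, shape = c(2, 784))
out <- layer(x)

layer$W$shape # (784, 32): input_dim was inferred from the data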

Use the pipe to add layers

model <- keras_model_sequential() %>%
  layer_simple_dense(32, activation = "relu") %>%
  layer_simple_dense(64, activation = "relu") %>%
  layer_simple_dense(32, activation = "relu") %>%
  layer_simple_dense(10, activation = "softmax")

From layers to models