Deep Learning with R – Chapter 3
knitr::opts_chunk$set(
warning = FALSE,
message = FALSE,
echo = TRUE,
eval = FALSE,
dpi = 300,
tidy = "styler",
fig.width = 8,
fig.height = 5
)
Packages
Find out the TensorFlow version on the machine.
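One way to check (assuming the tensorflow R package is installed):
library(tensorflow)
# report the TensorFlow version the R package is bound to
tf_version()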
3.6 TensorFlow tensors
r_array <- array(1:6, c(2, 3))
tf_tensor <- as_tensor(r_array)
tf_tensor
dim(tf_tensor)
length(tf_tensor)
rank (ndim) = how many axes
shape = how many elements in each axis
type (dtype) = the range of values and the memory footprint
We can often write the same code for TensorFlow tensors as R arrays.
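For example, elementwise arithmetic looks the same for both (a small illustration):
# elementwise addition works the same way on an R array and a tensor
r_array + r_array
tf_tensor + tf_tensor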
3.7 Tensor attributes
ndim = rank of the tensor = number of axes
# scalar, no axes, rank = 0
as_tensor(1)$ndim
# vector, 1 axis, rank = 1
as_tensor(1:10)$ndim
# matrix, 2 axes, rank = 2
tf_tensor$ndim
shape = how many elements in each axis
TensorShape([2, 3]) means: 2 elements in first axis, 3 elements in second axis
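We can see this by querying the shape attribute of the matrix tensor created above (a small illustration):
# shape of the rank-2 tensor from above: TensorShape([2, 3])
tf_tensor$shape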
In R, the default floating-point numeric type, double, is converted to tf.float64
r_array <- array(1)
r_array
typeof(r_array)
as_tensor(r_array)$dtype
# in the book, we use `float32` as default floating point datatype
as_tensor(r_array, dtype = "float32")
Tensor shape and reshaping
# make an array of 0
as_tensor(0, shape = c(2, 3))
# convert an R vector to a tensor
as_tensor(1:6, shape = c(2, 3))
Tensor vs. R array:
TensorFlow tensors use: row-major ordering = C ordering
R arrays use: column-major ordering = Fortran ordering
# explicit about reshaping behavior
# C ordering
array_reshape(1:6, dim = c(2, 3), order = "C")
# Fortran ordering
array_reshape(1:6, dim = c(2, 3), order = "F")
array_reshape() and as_tensor() can automatically infer the size of an unspecified axis: use -1 or NA for the axis to be inferred, as in the sketch below.
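A short sketch of this (assuming -1 marks the inferred axis for array_reshape() and NA marks it for as_tensor()):
# the size of the first axis is inferred from the total number of elements
array_reshape(1:6, dim = c(-1, 3))
as_tensor(1:6, shape = c(NA, 3))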
Tensor slicing
Tensors allow us to get “the rest of the tensor in that direction”
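A small sketch of such slices (illustrative only), where NA at the end of a slice range means "to the end of that axis":
x <- as_tensor(1:12, shape = c(3, 4))
# rows 2 through the last row
x[2:NA, ]
# columns 3 through the last column
x[, 3:NA]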
Tensor broadcasting
Broadcasting happens when an operation involves two tensors of different shapes: the smaller tensor is (virtually) repeated so that its shape matches the larger tensor.
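For example (a small illustration), adding a length-3 tensor to a 2 x 3 tensor broadcasts the smaller tensor across the first axis:
x <- as_tensor(1, shape = c(2, 3)) # shape (2, 3)
y <- as_tensor(c(10, 20, 30)) # shape (3)
# y is (virtually) repeated along the first axis to shape (2, 3)
x + y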
The tf module
How to create tensors?
We can use as_tensor to create tensors. Other ways?
library(tensorflow)
# all-ones tensor
tf$ones(shape(1, 3))
# all-zero tensor
tf$zeros(shape(1, 3))
# tensors of Gaussian random variable
tf$random$normal(shape(1, 3), mean = 0, stddev = 1)
Be careful of R syntax
The tf module uses 0-based indexing; R uses 1-based indexing.
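For example, reducing over the first axis of a tensor takes axis = 0L (a 0-based integer), where base R's apply() would use MARGIN = 1 (a small illustration):
x <- as_tensor(1:6, shape = c(2, 3))
# sum over the first axis; note the 0-based integer index
tf$reduce_sum(x, axis = 0L)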
Constant tensors and variables
Note: tensors are not modifiable! They are constant!
# in R, array is modifiable
x <- array(1, dim = c(2, 2))
x[1, 1] <- 0
x
# tensor is not modifiable
x <- as_tensor(1, shape = c(2, 2))
x[1, 1] <- 0 # error
If we need to update or change the state of a tensor, we need variables.
# create a TensorFlow variable
v <- tf$Variable(initial_value = tf$random$normal(shape(3, 1)))
v
# assign value to a variable
v$assign(tf$ones(shape(3, 1)))
v
# assign_add() means x <- x + value
v$assign_add(tf$ones(shape(3, 1)))
# assign_sub() means x <- x - value
v$assign_sub(tf$ones(shape(3, 1)))
Tensor operations: Doing math in TensorFlow
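A few representative operations (a brief sketch):
a <- tf$ones(shape(2, 2))
b <- tf$square(a) # elementwise square
c <- tf$sqrt(a) # elementwise square root
d <- b + c # elementwise addition
e <- tf$matmul(a, b) # matrix product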
GradientTape API
Plain R cannot retrieve the gradient of a differentiable expression with respect to its inputs; TensorFlow can, via the GradientTape API.
# using the GradientTape
input_var <- tf$Variable(initial_value = 3)
with(tf$GradientTape() %as% tape, {
result <- tf$square(input_var)
})
gradient <- tape$gradient(result, input_var)
Why care?
This is most commonly used to retrieve the gradients of the loss of a model w.r.t. its weights
Note: only trainable variables are tracked by default
With a constant tensor, we need to manually mark it as being tracked
by calling tape$watch()
# using GradientTape with constant tensor inputs
input_const <- as_tensor(3)
with(tf$GradientTape() %as% tape, {
tape$watch(input_const) # manually mark it as being tracked
result <- tf$square(input_const)
})
gradient <- tape$gradient(result, input_const)
gradient
Compute second-order gradients
# using nested gradient tapes to compute second-order gradients
time <- tf$Variable(0)
with(tf$GradientTape() %as% outer_tape, {
with(tf$GradientTape() %as% inner_tape, {
position <- 4.9 * time^2
})
speed <- inner_tape$gradient(position, time)
})
acceleration <- outer_tape$gradient(speed, time)
acceleration
Example: A linear classifier in pure TensorFlow
Generate two classes of points by drawing their coordinates from a multivariate normal distribution with a given covariance matrix and mean
The covariance matrix describes the shape of the point cloud, and the mean describes its position in the plane
# generating two classes of points
num_samples_per_class <- 1000
# covariance matrix
Sigma <- rbind(
c(1, 0.5),
c(0.5, 1)
)
negative_samples <- MASS::mvrnorm(
n = num_samples_per_class,
mu = c(0, 3),
Sigma = Sigma
)
positive_samples <- MASS::mvrnorm(
n = num_samples_per_class,
mu = c(3, 0),
Sigma = Sigma
)
# stacking them into an array
inputs <- rbind(negative_samples, positive_samples)
# generating the corresponding targets (0, 1)
targets <- rbind(
array(0, dim = c(num_samples_per_class, 1)),
array(1, dim = c(num_samples_per_class, 1))
)
Plot the data:
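One way to do this (a minimal sketch using base graphics, coloring each point by its class label):
plot(
inputs[, 1],
inputs[, 2],
col = ifelse(targets[, 1] == 0, "purple", "green")
)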
Create a linear classifier to separate these two classes of points:
- a linear classifier is an affine transformation, \(\text{prediction} = W \cdot \text{input} + b\), trained to minimize the squared difference between predictions and targets
# creating the linear classifier variables
# 1. specify input dim: inputs will be 2D points
input_dim <- 2
# 2. specify output dim: output predictions will be a single score per sample
output_dim <- 1
# 3. create W and b and initialize
W <- tf$Variable(
initial_value = tf$random$uniform(shape(input_dim, output_dim))
)
W
b <- tf$Variable(
initial_value = tf$zeros(shape(output_dim))
)
b
Specify model and loss function
# the forward pass function
model <- function(inputs) {
tf$matmul(inputs, W) + b
}
# the mean square loss function
# R style
square_loss <- function(targets, predictions) {
per_sample_losses <- (targets - predictions)^2
mean(per_sample_losses)
}
# tf style
square_loss <- function(targets, predictions) {
per_sample_losses <- tf$square(tf$subtract(targets, predictions))
tf$reduce_mean(per_sample_losses)
}
Training
# the training step function
learning_rate <- 0.1
# train step contains:
# 1. gradient computation
# 2. weight update
training_step <- function(inputs, targets) {
# forward pass, inside a gradient tape scope
with(tf$GradientTape() %as% tape, {
predictions <- model(inputs)
loss <- square_loss(targets, predictions)
})
# retrieve the gradient of the loss with regard to weights
grad_loss_wrt <- tape$gradient(loss, list(W = W, b = b))
# update weights
W$assign_sub(grad_loss_wrt$W * learning_rate)
b$assign_sub(grad_loss_wrt$b * learning_rate)
loss
}
The above is batch training rather than mini-batch training:
- In batch training, each training step (i.e., computing the gradient and updating the weights) uses ALL of the data, rather than a small batch of it.
# the batch training loop
inputs <- as_tensor(inputs, dtype = "float32")
# using 40 steps
for (step in seq(40)) {
loss <- training_step(inputs, targets)
cat(sprintf("Loss at step %s: %.4f\n", step, loss))
}
predictions <- model(inputs)
# convert tensors to R arrays for plotting
inputs <- as.array(inputs)
predictions <- as.array(predictions)
plot(
inputs[, 1],
inputs[, 2],
col = ifelse(predictions[, 1] <= 0.5, "purple", "green")
)
# plot the separating line
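# The decision boundary is where the prediction equals 0.5, i.e.
# W[1] * x1 + W[2] * x2 + b = 0.5,
# which rearranges to x2 = -W[1]/W[2] * x1 + (0.5 - b)/W[2]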
slope <- -W[1, ] / W[2, ]
intercept <- (.5 - b) / W[2, ]
abline(as.array(intercept), as.array(slope), col = "red")
3.8 Understanding core Keras APIs
Layers
think of layers as the LEGO bricks of deep learning
building deep learning models in Keras is done by clipping together compatible layers to form useful data-transformation pipelines
# implement a dense layer
layer_simple_dense <- new_layer_class(
classname = "SimpleDense",
initialize = function(units, activation = NULL) {
super$initialize()
self$units <- as.integer(units)
self$activation <- activation
},
# build(): to create weights
build = function(input_shape) {
input_dim <- input_shape[length(input_shape)]
self$W <- self$add_weight(
shape = c(input_dim, self$units),
initializer = "random_normal"
)
self$b <- self$add_weight(
shape = c(self$units),
initializer = "zeros"
)
},
# call(): to do computation
call = function(inputs) {
# affine transformation
y <- tf$matmul(inputs, self$W) + self$b
# activation
if (!is.null(self$activation)) {
y <- self$activation(y)
}
y
}
)
Instantiate our layer
my_dense <- layer_simple_dense(
units = 32,
activation = tf$nn$relu
)
input_tensor <- as_tensor(1, shape = c(2, 784))
output_tensor <- my_dense(input_tensor)
output_tensor$shape
Automatic shape inference
units: the number of neurons in the layer. It determines
the dimension of the output.
Recall that each neuron (unit) performs an affine transformation followed by an activation, \(\sigma(W^T X + b)\)
# the layer will transform the input data into a 32-dimensional vector
layer <- layer_dense(units = 32, activation = "relu")
No need to worry about size compatibility most of the time
model <- keras_model_sequential(layers = list(
layer_dense(units = 32, activation = "relu"),
layer_dense(units = 32)
))
Key takeaway:
There is no need to specify the input shape; it is inferred automatically, which ensures size compatibility between layers.
You only need to provide units (the output dimensionality).
Use the pipe to add layers
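A sketch of the same two-layer model built with the pipe:
model <- keras_model_sequential() %>%
layer_dense(units = 32, activation = "relu") %>%
layer_dense(units = 32)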