Image recognition (classification, segmentation, object detection)
Natural language processing (language models, translation, generation)
Domain modeling (embeddings)
Entity generation ("fakes"): Faces, animals, scenes...
Tasks involving tabular data (e.g., recommender systems)
Playing games (deep reinforcement learning)
A single numeric prediction
A single segmentation mask
A single translation
A single embedding
But wait ... aren't there probabilities in there, somewhere?
library(magick)

# Attribution: Ben Tubby [CC BY 2.0 (https://creativecommons.org/licenses/by/2.0)]
# https://upload.wikimedia.org/wikipedia/commons/3/30/Falkland_Islands_Penguins_35.jpg
path <- "penguin1.jpg"

image_read(path) %>% image_resize("140")
library(tensorflow)
library(keras)

model <- application_mobilenet_v2()

image <- # do some preprocessing ...

probs <- model %>% predict(image)
imagenet_decode_predictions(probs)
  class_name class_description        score
1  n02056570      king_penguin 0.9899597168
2  n01847000             drake 0.0011793933
3  n01798484   prairie_chicken 0.0002387235
4  n02058221         albatross 0.0002117234
5  n02071294      killer_whale 0.0001432021
# Attribution: M. Murphy [Public domain]
# https://upload.wikimedia.org/wikipedia/commons/2/22/RoyalPenguins3.JPG
path <- "penguin2.jpg"

image_read(path) %>% image_resize("180")
probs <- model %>% predict(image)
imagenet_decode_predictions(probs)
  class_name class_description      score
1  n02051845           pelican 0.24614049
2  n02009912    American_egret 0.18564136
3  n02058221         albatross 0.06848499
4  n02012849             crane 0.04572001
5  n02009229 little_blue_heron 0.03902744
model <- keras_model_sequential() %>%
  layer_dense(units = 32, activation = "relu", input_shape = 7) %>%
  # default activation = linear
  layer_dense(units = 1)
model <- keras_model_sequential() %>%
  layer_dense(units = 32, activation = "relu", input_shape = 7) %>%
  layer_dense(units = 1, activation = "sigmoid")
model <- keras_model_sequential() %>%
  layer_dense(units = 32, activation = "relu", input_shape = 7) %>%
  layer_dense(units = 10, activation = "softmax")
Before and after softmax activation.
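To see what the softmax on the last layer does, here is a minimal base-R sketch with made-up scores: it maps arbitrary real-valued network outputs to non-negative values that sum to 1.

# illustrative only; the score values are invented
softmax <- function(z) exp(z) / sum(exp(z))

scores <- c(1.2, -0.3, 2.5)   # raw outputs ("logits")
softmax(scores)
# approx. 0.204 0.046 0.750 -- non-negative, summing to 1
sum(softmax(scores))
# 1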
... how can we make this probabilistic?
TensorFlow Probability (Python library on top of TensorFlow)
tfprobability (R package)
devtools::install_github("rstudio/tfprobability")

library(tfprobability)
install_tfprobability()
d <- tfd_binomial(total_count = 7, probs = 0.3)

d %>% tfd_mean()
#> tf.Tensor(2.1000001, shape=(), dtype=float32)
d %>% tfd_variance()
#> tf.Tensor(1.47, shape=(), dtype=float32)
d %>% tfd_log_prob(2.3)
#> tf.Tensor(-1.1914139, shape=(), dtype=float32)
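Distributions can also be sampled from; a quick sketch (the drawn values differ from run to run):

d %>% tfd_sample(5)
# a tensor of 5 draws from Binomial(total_count = 7, probs = 0.3)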
b <- tfb_affine_scalar(shift = 3.33, scale = 0.5)

x <- c(100, 1000, 10000)
b %>% tfb_forward(x)
#> tf.Tensor([  53.33  503.33 5003.33], shape=(3,), dtype=float32)
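Bijectors are invertible: applying tfb_inverse to the forward-transformed values recovers the inputs. A quick sketch:

y <- b %>% tfb_forward(x)
b %>% tfb_inverse(y)
# recovers (up to floating point) the original x: 100, 1000, 10000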
Keras layers
Markov Chain Monte Carlo (Hamiltonian Monte Carlo, NUTS)
Variational inference
State space models
GLMs
A network that has a multivariate normal distribution as output
model <- keras_model_sequential() %>%
  layer_dense(units = params_size_multivariate_normal_tri_l(d)) %>%
  layer_multivariate_normal_tri_l(event_size = d)

log_loss <- function(y, model) - (model %>% tfd_log_prob(y))

model %>% compile(optimizer = "adam", loss = log_loss)

model %>% fit(
  x,
  y,
  batch_size = 100,
  epochs = 1,
  steps_per_epoch = 10
)
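Because the model's output is a distribution, a fitted model can be asked for means or samples directly. A sketch, assuming x_test (a hypothetical name) holds held-out predictors:

yhat <- model(tf$constant(x_test))   # a multivariate normal distribution object
yhat %>% tfd_mean()                  # predicted mean vectors
yhat %>% tfd_sample(10)              # 10 draws per test observation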
Instead of a single unit (of a dense layer), we output a normal distribution:
model <- keras_model_sequential() %>%
  layer_dense(units = 8, activation = "relu") %>%
  layer_dense(units = 2, activation = "linear") %>%
  layer_distribution_lambda(function(x)
    tfd_normal(
      loc = x[, 1, drop = FALSE],
      scale = 1e-3 + tf$math$softplus(x[, 2, drop = FALSE])
    )
  )

negloglik <- function(y, model) - (model %>% tfd_log_prob(y))

model %>% compile(
  optimizer = optimizer_adam(lr = 0.01),
  loss = negloglik
)

model %>% fit(x, y, epochs = 1000)
yhat <- model(tf$constant(x_test))
mean <- yhat %>% tfd_mean()
sd <- yhat %>% tfd_stddev()
model <- keras_model_sequential() %>%
  layer_dense_variational(
    units = 1,
    make_posterior_fn = posterior_mean_field,
    make_prior_fn = prior_trainable,
    kl_weight = 1 / n
  ) %>%
  layer_distribution_lambda(
    function(x) tfd_normal(loc = x, scale = 1)
  )

negloglik <- function(y, model) - (model %>% tfd_log_prob(y))

model %>% compile(
  optimizer = optimizer_adam(lr = 0.1),
  loss = negloglik
)

model %>% fit(x, y, epochs = 1000)
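The code above assumes posterior_mean_field and prior_trainable have been defined by the user. A sketch of what these typically look like, closely modeled on the TensorFlow Probability examples (initialization details and constants may differ):

# variational posterior: independent normals with trainable means and scales
posterior_mean_field <- function(kernel_size, bias_size = 0, dtype = NULL) {
  n <- kernel_size + bias_size
  c <- log(expm1(1))   # so that softplus(c) == 1
  keras_model_sequential() %>%
    layer_variable(shape = 2 * n, dtype = dtype) %>%
    layer_distribution_lambda(function(t)
      tfd_independent(
        tfd_normal(
          loc = t[1:n],
          scale = 1e-5 + tf$nn$softplus(c + t[(n + 1):(2 * n)])
        ),
        reinterpreted_batch_ndims = 1
      )
    )
}

# prior: independent unit-scale normals with a trainable location
prior_trainable <- function(kernel_size, bias_size = 0, dtype = NULL) {
  n <- kernel_size + bias_size
  keras_model_sequential() %>%
    layer_variable(n, dtype = dtype, trainable = TRUE) %>%
    layer_distribution_lambda(function(t)
      tfd_independent(
        tfd_normal(loc = t, scale = 1),
        reinterpreted_batch_ndims = 1
      )
    )
}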
Every prediction uses a different sample from the weight distributions!
yhats <- purrr::map(1:100, function(x) model(tf$constant(x_test)))
model <- keras_model_sequential() %>%
  layer_dense_variational(
    units = 2,
    make_posterior_fn = posterior_mean_field,
    make_prior_fn = prior_trainable,
    kl_weight = 1 / n
  ) %>%
  layer_distribution_lambda(function(x)
    tfd_normal(
      loc = x[, 1, drop = FALSE],
      scale = 1e-3 + tf$math$softplus(0.01 * x[, 2, drop = FALSE])
    )
  )

yhats <- purrr::map(1:100, function(x) model(tf$constant(x_test)))

means <- purrr::map(yhats, purrr::compose(as.matrix, tfd_mean)) %>%
  abind::abind()

sds <- purrr::map(yhats, purrr::compose(as.matrix, tfd_stddev)) %>%
  abind::abind()
Each line is one draw from the posterior weights; each line has its own standard deviation.
More background: https://blogs.rstudio.com/tensorflow/posts/2019-06-05-uncertainty-estimates-tfprobability/
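To condense the 100 draws above into a single predictive estimate, one can average the per-draw means and combine both sources of uncertainty via the law of total variance. A rough sketch, assuming means and sds come out of abind as (number of test points) x 100 matrices:

# posterior-mean prediction per test point
mean_overall <- rowMeans(means)

# total predictive standard deviation:
# aleatoric (average within-draw variance) + epistemic (variance of the means)
sd_overall <- sqrt(rowMeans(sds^2) + apply(means, 1, var))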
encoder_model <- keras_model_sequential() %>%
  [...] %>%
  layer_multivariate_normal_tri_l(event_size = encoded_size) %>%
  # pass in the prior of your choice ...
  # can use exact KL divergence or Monte Carlo approximation
  layer_kl_divergence_add_loss([...])

decoder_model <- keras_model_sequential() %>%
  [...] %>%
  layer_independent_bernoulli([...])

vae_model <- keras_model(
  inputs = encoder_model$inputs,
  outputs = decoder_model(encoder_model$outputs[1])
)

vae_loss <- function(x, rv_x) - (rv_x %>% tfd_log_prob(x))
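For "the prior of your choice" above, a common default is a standard normal over the latent code; a minimal sketch (how it is wired into the KL-divergence layer is left to the elided parts above):

# standard-normal prior over the encoded_size-dimensional latent code
prior <- tfd_independent(
  tfd_normal(loc = rep(0, encoded_size), scale = 1),
  reinterpreted_batch_ndims = 1
)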
mcmc_ kernels available to the user: https://blogs.rstudio.com/tensorflow/posts/2019-05-06-tadpoles-on-tensorflow/
Define a joint probability distribution:
m2 <- tfd_joint_distribution_sequential(
  list(
    # a_bar: global prior for the per-tank intercepts
    tfd_normal(loc = 0, scale = 1.5),
    # sigma: prior for the variation between tanks
    tfd_exponential(rate = 1),
    # per-tank survival logits, centered at a_bar
    function(sigma, a_bar)
      tfd_sample_distribution(
        tfd_normal(loc = a_bar, scale = sigma),
        sample_shape = list(n_tadpole_tanks)
      ),
    # number of survivors per tank, binomial given the logits
    function(l)
      tfd_independent(
        tfd_binomial(total_count = n_start, logits = l),
        reinterpreted_batch_ndims = 1
      )
  )
)
Define optimization target (loss) and kernel (algorithm):
logprob <- function(a, s, l)
  m2 %>% tfd_log_prob(list(a, s, l, n_surviving))

hmc <- mcmc_hamiltonian_monte_carlo(
  target_log_prob_fn = logprob,
  num_leapfrog_steps = 3,
  step_size = 0.1
) %>%
  mcmc_simple_step_size_adaptation(
    target_accept_prob = 0.8,
    num_adaptation_steps = n_burnin
  )
Get starting values and sample:
c(initial_a, initial_s, initial_logits, .) %<-% (m2 %>% tfd_sample(n_chain))

run_mcmc <- function(kernel) {
  kernel %>% mcmc_sample_chain(
    num_results = n_steps,
    num_burnin_steps = n_burnin,
    current_state = list(initial_a, tf$ones_like(initial_s), initial_logits)
  )
}

res <- hmc %>% run_mcmc()
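Once sampling has finished, the chains can be checked with the usual diagnostics. A sketch, assuming logits_samples (a hypothetical name; how to extract it from res depends on the trace_fn used) holds the per-tank logits with shape [n_steps, n_chain, n_tadpole_tanks]:

mcmc_potential_scale_reduction(logits_samples)   # R-hat, should be close to 1
mcmc_effective_sample_size(logits_samples)       # effective sample size per tank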
Use variational inference or (Hamiltonian) Monte Carlo to
decompose
filter (as in: Kálmán filter)
smooth
forecast
dynamic linear models.
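As a sketch of how a structural time series model is put together with the sts_ family (argument names as in the tfprobability reference; ts and num_seasons = 12 are assumptions, see the linked post below for a complete, worked example):

# an additive model: local linear trend + seasonality
trend    <- sts_local_linear_trend(observed_time_series = ts)
seasonal <- sts_seasonal(observed_time_series = ts, num_seasons = 12)
model    <- sts_sum(observed_time_series = ts, components = list(trend, seasonal))

# fit the model parameters with Hamiltonian Monte Carlo;
# sts_forecast() / sts_decompose_by_component() can then be used downstream
res <- sts_fit_with_hmc(
  model = model,
  observed_time_series = ts,
  num_results = 200
)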
Dynamic regression example: https://blogs.rstudio.com/tensorflow/posts/2019-06-25-dynamic_linear_models_tfprobability/
tfprobability (TensorFlow Probability) - your toolbox for "everything Bayesian"
Lots of ongoing development on the TFP side - stay tuned for cool additions :-)
Follow the blog for applications and background: https://blogs.rstudio.com/tensorflow/
Depending on your needs, pick what's most useful to you.
Thanks for listening!