import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

print('TF version:', tf.__version__)
print('TFP version:', tfp.__version__)

TF version: 2.3.0
TFP version: 0.11.0


# Plot the approximate density contours of a 2d spherical Gaussian

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

spherical_2d_gaussian = tfd.MultivariateNormalDiag(loc = [0., 0.])

N = 100000
x = spherical_2d_gaussian.sample(N)
x1 = x[:, 0]
x2 = x[:, 1]
sns.jointplot(x1, x2, kind = 'kde', space = 0)

/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

<seaborn.axisgrid.JointGrid at 0x7f55a5889210>


# Set the mean and covariance parameters
mu = [0., 0.]  # mean
scale_tril = [[1.,  0.], [0.6, 0.8]]

sigma = tf.matmul(tf.constant(scale_tril), tf.transpose(tf.constant(scale_tril)))  # covariance matrix
print(sigma)

tf.Tensor(
[[1.  0.6]
 [0.6 1. ]], shape=(2, 2), dtype=float32)


# Create the 2D Gaussian with full covariance
nonspherical_2d_gaussian = tfd.MultivariateNormalTriL(loc = mu, scale_tril = scale_tril)
nonspherical_2d_gaussian

<tfp.distributions.MultivariateNormalTriL 'MultivariateNormalTriL' batch_shape=[] event_shape=[2] dtype=float32>


# Check the Distribution mean
nonspherical_2d_gaussian.mean()

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>


# Check the Distribution covariance
nonspherical_2d_gaussian.covariance()

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1. , 0.6],
       [0.6, 1. ]], dtype=float32)>


# Plot its approximate density contours
x = nonspherical_2d_gaussian.sample(N)
x1 = x[:, 0]
x2 = x[:, 1]
sns.jointplot(x1, x2, kind = 'kde', space = 0, color = 'r')

/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

<seaborn.axisgrid.JointGrid at 0x7f55341cdc50>


# Define a symmetric positive-definite matrix
sigma = [[10., 5.], [5., 10.]]


# Compute the lower triangular matrix L from the Cholesky decomposition
scale_tril = tf.linalg.cholesky(sigma)
scale_tril

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[3.1622777, 0.       ],
       [1.5811388, 2.7386127]], dtype=float32)>


# Check that LL^T = Sigma
tf.linalg.matmul(scale_tril, tf.transpose(scale_tril))

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[10.      ,  5.      ],
       [ 5.      ,  9.999999]], dtype=float32)>


# Try to compute the Cholesky decomposition for a matrix with negative eigenvalues
bad_sigma = [[10., 11.], [11., 10.]]

try:
    scale_tril = tf.linalg.cholesky(bad_sigma)
except Exception as e:
    print(e)

Cholesky decomposition was not successful. The input might not be valid. [Op:Cholesky]


# Create a multivariate Gaussian with a positive semi-definite covariance matrix
psd_mvn = tfd.MultivariateNormalTriL(loc = [0., 0.], scale_tril = [[1., 0.], [0.4, 0.]])
psd_mvn

<tfp.distributions.MultivariateNormalTriL 'MultivariateNormalTriL' batch_shape=[] event_shape=[2] dtype=float32>


# Plot samples from this distribution
x = psd_mvn.sample(N)
x1 = x[:, 0]
x2 = x[:, 1]
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.title('Scatter plot of samples')
plt.scatter(x1, x2, alpha = 0.5)

<matplotlib.collections.PathCollection at 0x7f55a5704e50>


# Try to compute the Cholesky decomposition for a positive semi-definite matrix
another_bad_sigma = [[10., 0.], [0., 0.]]

try:
    scale_tril = tf.linalg.cholesky(another_bad_sigma)
except Exception as e:
    print(e)

Cholesky decomposition was not successful. The input might not be valid. [Op:Cholesky]


# Create a multivariate Gaussian distribution
mu = [1., 2., 3.]
sigma = [
    [0.5, 0.1, 0.1],
    [0.1,  1., 0.6],
    [0.1, 0.6, 2.]
]

scale_tril = tf.linalg.cholesky(sigma)
multivariate_normal = tfd.MultivariateNormalTriL(loc = mu, scale_tril = scale_tril)


# Check the covariance matrix
multivariate_normal.covariance()

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0.4999999 , 0.09999999, 0.09999999],
       [0.09999999, 1.0000001 , 0.6000001 ],
       [0.09999999, 0.6000001 , 2.0000002 ]], dtype=float32)>


# Check the mean
multivariate_normal.mean()

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([1., 2., 3.], dtype=float32)>


import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

print('TF version:', tf.__version__)
print('TFP version:', tfp.__version__)

TF version: 2.3.0
TFP version: 0.11.0


import numpy as np


# Add two arrays with different shapes
a = np.array([[1.], [2.], [3.], [4.]])  # shape (4, 1)
b = np.array([0., 1., 2.])  # shape (3,) 
a + b

array([[1., 2., 3.],
       [2., 3., 4.],
       [3., 4., 5.],
       [4., 5., 6.]])


# Multiply two arrays with different shapes
a = np.array([[[0.01], [0.1]], [[1.00], [10.]]]) # shape (2, 2, 1)
b = np.array([[[2., 2.]], [[3., 3.]]]) # shape (2, 1, 2)
a * b # shape (2, 2, 2)

array([[[2.e-02, 2.e-02],
        [2.e-01, 2.e-01]],

       [[3.e+00, 3.e+00],
        [3.e+01, 3.e+01]]])


# Use broadcasting to compute an outer product
a = np.array([-1., 0., 1.])
b = np.array([0., 1., 2., 3.])
a[:, np.newaxis] * b  # outer product ab^T, where a and b are column vectors

array([[-0., -1., -2., -3.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  1.,  2.,  3.]])


# Define three arrays with different shapes
a = [[1.], [2.], [3.]]
b = np.zeros(shape = [10, 1, 1])
c = np.ones(shape = [4])


# Predict the shape before executing this cell
(a + b).shape

(10, 3, 1)


# Predict the shape before executing this cell
(a * c).shape

(3, 4)


# Predict the shape before executing this cell
(a * b + c).shape

(10, 3, 4)


# Define a batch of Normal distributions without broadcasting
batch_of_normals = tfd.Normal(loc = [0., 0., 0., 1., 1., 1.], scale = [1., 10., 100., 1., 10., 100.])


# Print the distribution and notice the batch and event shapes
batch_of_normals

<tfp.distributions.Normal 'Normal' batch_shape=[6] event_shape=[] dtype=float32>


# Check the parameter values for loc
batch_of_normals.loc

<tf.Tensor: shape=(6,), dtype=float32, numpy=array([0., 0., 0., 1., 1., 1.], dtype=float32)>


# Check the parameter values for scale
batch_of_normals.scale

<tf.Tensor: shape=(6,), dtype=float32, numpy=array([  1.,  10., 100.,   1.,  10., 100.], dtype=float32)>


# Define a batch of Normal distributions with broadcasting
loc = [[0.], [1.]]
scale = [1., 10., 100.]
another_batch_of_normals = tfd.Normal(loc = loc, scale = scale)


# Print the distribution and notice the batch and event shapes
another_batch_of_normals

<tfp.distributions.Normal 'Normal' batch_shape=[2, 3] event_shape=[] dtype=float32>


# The stored loc parameter values are what you pass in, not what is used after broadcasting
another_batch_of_normals.loc

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.],
       [1.]], dtype=float32)>


# The stored scale parameter values are what you pass in, not what is used after broadcasting
another_batch_of_normals.scale

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([  1.,  10., 100.], dtype=float32)>


# Define a batch of Normal distributions with broadcasting
loc = [[0.], [10.]]
scale = [1., 1., 1.]
another_batch_of_normals = tfd.Normal(loc = loc, scale = scale)
another_batch_of_normals

<tfp.distributions.Normal 'Normal' batch_shape=[2, 3] event_shape=[] dtype=float32>


# Use broadcasting along the second axis with the prob method
sample = tf.random.uniform((2, 1))
another_batch_of_normals.prob(sample)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[3.6183167e-01, 3.6183167e-01, 3.6183167e-01],
       [4.8273273e-19, 4.8273273e-19, 4.8273273e-19]], dtype=float32)>


# Use broadcasting along the first axis with the prob method
sample = tf.random.uniform((1, 3))
another_batch_of_normals.prob(sample)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[3.4341156e-01, 3.7987795e-01, 3.7401137e-01],
       [1.5809656e-20, 1.6750055e-21, 2.6204080e-21]], dtype=float32)>


# Use broadcasting along both axes with the prob method
sample = tf.random.uniform((1, 1))
another_batch_of_normals.prob(sample)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[3.6780354e-01, 3.6780354e-01, 3.6780354e-01],
       [3.9974444e-21, 3.9974444e-21, 3.9974444e-21]], dtype=float32)>


# Use broadcasting along the first axis with the log_prob method
sample = tf.random.uniform((1, 3))
another_batch_of_normals.log_prob(sample)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ -0.95117575,  -0.9875492 ,  -0.96976864],
       [-48.411987  , -47.28321   , -47.78135   ]], dtype=float32)>


# Define a multivariate Gaussian distribution without broadcasting
single_mvt_normal = tfd.MultivariateNormalDiag(loc = [0., 0.], scale_diag = [1., 0.5])
single_mvt_normal

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[] event_shape=[2] dtype=float32>


# Print the loc parameter
single_mvt_normal.loc

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>


# Print the covariance matrix - the diagonal is scale_diag^2
single_mvt_normal.covariance()

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1.  , 0.  ],
       [0.  , 0.25]], dtype=float32)>


# Define a multivariate Gaussian distribution with broadcasting
loc = [[0., 0.], [1., 1.]]
scale_diag = [1., 0.5]
batch_of_mvt_normals = tfd.MultivariateNormalDiag(loc = loc, scale_diag = scale_diag)


# Print the distribution - note the event_shape and batch_shape
batch_of_mvt_normals

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[2] event_shape=[2] dtype=float32>


# Print the distribution parameters
# There is a batch of two distributions with different means and same covariance
batch_of_mvt_normals.parameters

{'loc': ListWrapper([ListWrapper([0.0, 0.0]), ListWrapper([1.0, 1.0])]),
 'scale_diag': ListWrapper([1.0, 0.5]),
 'scale_identity_multiplier': None,
 'validate_args': False,
 'allow_nan_stats': True,
 'name': 'MultivariateNormalDiag'}


# Define a multivariate Gaussian distribution with broadcasting
loc = [[1.,  1.,  1.], [-1., -1., -1.]] # shape (2, 3)
scale_diag = [[[0.1, 0.1, 0.1]], [[10., 10., 10.]]] # shape (2, 1, 3)
another_batch_of_mvt_normals = tfd.MultivariateNormalDiag(loc = loc, scale_diag = scale_diag)


# Print the distribution and note batch and event shapes - bingo!
another_batch_of_mvt_normals

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[2, 2] event_shape=[3] dtype=float32>


# Print the distribution parameters
another_batch_of_mvt_normals.parameters

{'loc': ListWrapper([ListWrapper([1.0, 1.0, 1.0]), ListWrapper([-1.0, -1.0, -1.0])]),
 'scale_diag': ListWrapper([ListWrapper([ListWrapper([0.1, 0.1, 0.1])]), ListWrapper([ListWrapper([10.0, 10.0, 10.0])])]),
 'scale_identity_multiplier': None,
 'validate_args': False,
 'allow_nan_stats': True,
 'name': 'MultivariateNormalDiag'}


# Define a batch of Normal distributions with broadcasting
loc = [[0.], [1.], [0.]]
scale = [1., 10., 100., 1., 10, 100.]
another_batch_of_normals = tfd.Normal(loc = loc, scale = scale)
another_batch_of_normals

<tfp.distributions.Normal 'Normal' batch_shape=[3, 6] event_shape=[] dtype=float32>


# Create a multivariate Independent distribution
another_batch_of_mvt_normals = tfd.Independent(another_batch_of_normals)
another_batch_of_mvt_normals

<tfp.distributions.Independent 'IndependentNormal' batch_shape=[3] event_shape=[6] dtype=float32>


# Use broadcasting with the prob method
# B shaped input (broadcast over event)
sample = tf.random.uniform((3, 1))
another_batch_of_mvt_normals.prob(sample)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([1.5887796e-09, 3.7002286e-09, 2.5683733e-09], dtype=float32)>


# Use broadcasting with the prob method
# E shaped input (broadcast over batch)
sample = tf.random.uniform((1, 6))
another_batch_of_mvt_normals.prob(sample)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([2.850026e-09, 2.595904e-09, 2.850026e-09], dtype=float32)>


# Use broadcasting with the prob method
# [S,B,E] shaped input (broadcast over samples)
sample = tf.random.uniform((2, 3, 6))
another_batch_of_mvt_normals.prob(sample)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[3.3494645e-09, 2.0184825e-09, 2.4749744e-09],
       [3.0989775e-09, 2.5836933e-09, 3.9658077e-09]], dtype=float32)>


# [S,b,e] shaped input where [b,e] can be broadcast agaisnt [B,E]
sample = tf.random.uniform((2, 1, 6))
another_batch_of_mvt_normals.prob(sample)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[3.4162173e-09, 2.2252702e-09, 3.4162173e-09],
       [2.4587419e-09, 3.3255294e-09, 2.4587419e-09]], dtype=float32)>


# Use broadcasting with the log_prob method
# [S,b,e] shaped input where [b,e] can be broadcast agaisnt [B,E]
sample = tf.random.uniform((2, 3, 1))
another_batch_of_mvt_normals.prob(sample)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[4.0049937e-09, 3.6004075e-09, 2.5192635e-09],
       [4.0302814e-09, 2.1555266e-09, 2.2629485e-09]], dtype=float32)>


import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

print('TF version:', tf.__version__)
print('TFP version:', tfp.__version__)

TF version: 2.3.0
TFP version: 0.11.0


# Additional imports and setting fixed random seed to have reproducibility
import matplotlib.pyplot as plt
import numpy as np
tf.random.set_seed(123)


# Create a normal distribution from Tensorflow Distributions
normal = tfd.Normal(loc = 0, scale = 1)
normal

<tfp.distributions.Normal 'Normal' batch_shape=[] event_shape=[] dtype=float32>


# Sample from the chosen distribution...
normal.sample()

<tf.Tensor: shape=(), dtype=float32, numpy=-0.8980837>


# ... or sample multiple times
normal.sample(10)

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.33875433,  0.3449861 , -0.6605785 , -0.28549942,  0.43852386,
        0.8288566 , -0.53591555, -0.53534836, -1.0324249 , -2.942705  ],
      dtype=float32)>


# Obtain value of probability's density
normal.prob(0)

<tf.Tensor: shape=(), dtype=float32, numpy=0.3989423>


# Obtain value of logprobability
normal.log_prob(0)

<tf.Tensor: shape=(), dtype=float32, numpy=-0.9189385>


# Verify that this really is the log of the probability
np.log(normal.prob(0))

-0.9189385


# Plot a histogram, approximating the density
plt.hist(x = normal.sample(10000).numpy(), bins = 50, density = True)
plt.show()


# Do the same for the exponential distribution
exponential = tfd.Exponential(rate = 1)
plt.hist(x = exponential.sample(10000).numpy(), bins = 50, density = True)
plt.show()


# Sample as before
exponential.sample(10)

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([0.3179616 , 0.9595855 , 0.6190708 , 1.3738598 , 1.6796894 ,
       0.40142855, 1.6830153 , 1.78942   , 0.38126466, 0.5528394 ],
      dtype=float32)>


# Create a Bernoulli distribution (discrete)
bernoulli = tfd.Bernoulli(probs = 0.8)
bernoulli.sample(20)

<tf.Tensor: shape=(20,), dtype=int32, numpy=
array([0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1],
      dtype=int32)>


# Calculate Bernoulli prob and see that 0.5 and -1 do not give the correct probability!
for k in [0, 0.5, 1, -1]:
    print('prob result {} for k = {} '.format(bernoulli.prob(k), k))

prob result 0.20000000298023224 for k = 0 
prob result 0.4000000059604645 for k = 0.5 
prob result 0.800000011920929 for k = 1 
prob result 0.04999999701976776 for k = -1


# Replicate the scores to see what is occurring under the hood
def my_bernoulli(p_success, k):
    return np.power(p_success, k) * np.power((1 - p_success), (1 - k))


# Evaluate it as before
for k in [0, 0.5, 1, -1]:
    print('prob result {} for k = {} '.format(my_bernoulli(p_success = 0.8, k = k), k))

prob result 0.19999999999999996 for k = 0 
prob result 0.3999999999999999 for k = 0.5 
prob result 0.8 for k = 1 
prob result 0.049999999999999975 for k = -1


# Create a batched Bernoulli distribution
bernoulli_batch = tfd.Bernoulli(probs = [0.1, 0.25, 0.5, 0.75, 0.9])
bernoulli_batch

<tfp.distributions.Bernoulli 'Bernoulli' batch_shape=[5] event_shape=[] dtype=int32>


# Sample from it, noting the shape
bernoulli_batch.sample(5)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[0, 0, 0, 1, 1],
       [0, 1, 0, 1, 1],
       [0, 0, 1, 1, 1],
       [0, 0, 0, 1, 1],
       [0, 0, 0, 1, 1]], dtype=int32)>


# Use a batch shape with higher rank
probs = [[[0.5, 0.5],  [0.8, 0.3],  [0.25, 0.75]]]
bernoulli_batch_2D = tfd.Bernoulli(probs = probs)
bernoulli_batch_2D

<tfp.distributions.Bernoulli 'Bernoulli' batch_shape=[1, 3, 2] event_shape=[] dtype=int32>


# Sample from this batch of distributions
bernoulli_batch_2D.sample(5)

<tf.Tensor: shape=(5, 1, 3, 2), dtype=int32, numpy=
array([[[[0, 0],
         [1, 1],
         [0, 1]]],


       [[[0, 0],
         [1, 0],
         [0, 1]]],


       [[[1, 0],
         [1, 1],
         [0, 1]]],


       [[[1, 0],
         [0, 0],
         [0, 1]]],


       [[[0, 1],
         [1, 1],
         [1, 0]]]], dtype=int32)>


# Determine probabilities from this batch distribution
bernoulli_batch_2D.prob([[[1, 0], [0, 0], [1, 1]]])

<tf.Tensor: shape=(1, 3, 2), dtype=float32, numpy=
array([[[0.5      , 0.5      ],
        [0.2      , 0.6999999],
        [0.25     , 0.75     ]]], dtype=float32)>


# Define 2D multivariate Gaussian with diagonal covariance matrix
normal_diag = tfd.MultivariateNormalDiag(loc = [0, 1], scale_diag = [1, 2])
normal_diag

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[] event_shape=[2] dtype=float32>


# Sample from it
normal_diag.sample(10)

<tf.Tensor: shape=(10, 2), dtype=float32, numpy=
array([[-0.37992278,  2.3674164 ],
       [-2.224005  , -0.28514457],
       [ 0.923083  , -1.4528892 ],
       [-0.62774605, -0.33852375],
       [-0.6252951 , -1.3324146 ],
       [-0.42454168,  1.3192185 ],
       [-1.702882  ,  1.8533869 ],
       [-0.4608376 , -0.7023523 ],
       [-1.1919353 , -0.12865639],
       [ 0.48053816, -0.2693485 ]], dtype=float32)>


# Make a plot
plt_sample = normal_diag.sample(10000)
plt.scatter(plt_sample[:, 0], plt_sample[:, 1], marker = '.', alpha = 0.05)
plt.axis('equal')
plt.show()


# Create three "batches" of multivariate normals
normal_diag_batch = tfd.MultivariateNormalDiag \
    (loc = [[0, 0], [0, 0], [0, 0]], scale_diag = [[1, 2], [2, 1], [2, 2]])
normal_diag_batch

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[3] event_shape=[2] dtype=float32>


# Sample from it
samples = normal_diag_batch.sample(5)
samples

<tf.Tensor: shape=(5, 3, 2), dtype=float32, numpy=
array([[[-0.8012545 , -2.128108  ],
        [ 2.0774972 , -2.7921855 ],
        [ 0.52665955,  0.60957587]],

       [[ 0.9923561 , -0.9778331 ],
        [-0.8376892 ,  0.70630246],
        [-1.0894657 , -0.65969497]],

       [[-1.6264789 ,  2.2429497 ],
        [-4.301875  , -0.7626804 ],
        [-0.4345196 , -0.57022095]],

       [[ 0.73075646,  2.835662  ],
        [ 1.8173586 ,  1.2079152 ],
        [-3.2939956 ,  2.33647   ]],

       [[-0.24759005,  0.56306183],
        [-0.6053428 ,  0.06578209],
        [ 1.4922864 , -0.55439734]]], dtype=float32)>


# Compute log probs
normal_diag_batch.log_prob(samples)

<tf.Tensor: shape=(5, 3), dtype=float32, numpy=
array([[-3.4181342, -6.9686737, -3.3052907],
       [-3.142929 , -2.8681712, -3.426938 ],
       [-4.482594 , -5.1351314, -3.2884164],
       [-3.803149 , -3.6734028, -5.262859 ],
       [-2.6013045, -2.5789928, -3.5409558]], dtype=float32)>


# Create a sample for a plot -- notice the shape
plt_sample_batch = normal_diag_batch.sample(10000)
plt_sample_batch.shape

TensorShape([10000, 3, 2])


# Plot samples from the batched multivariate Gaussian
fig, axs = (plt.subplots(1, 3, sharex = True, sharey = True, figsize = (10, 3)))
titles = ['cov_diag = [1, 2]','cov_diag = [2, 1]', 'cov_diag = [2, 2]']

for i, (ax, title) in enumerate(zip(axs, titles)):
    samples = plt_sample_batch[:, i, :] #take the ith batch [samples x event_shape]
    ax.scatter(samples[:, 0], samples[:, 1], marker = '.', alpha = 0.05)
    ax.set_title(title)
plt.show()


# Start by defining a batch of two univariate Gaussians, then
# combine them into a bivariate Gaussian with independent components
locs = [-1., 1]
scales = [0.5, 1.]
batch_of_normals = tfd.Normal(loc = locs, scale = scales)


# Univariate density functions
import seaborn as sns

t = np.linspace(-4, 4, 10000)
densities = batch_of_normals.prob(np.repeat(t[:, np.newaxis], 2, axis = 1)) # each column is a vector of densities for one distn

sns.lineplot(t, densities[:, 0], label = 'loc = {}, scale = {}'.format(locs[0], scales[0]))
sns.lineplot(t, densities[:, 1], label = 'loc = {}, scale = {}'.format(locs[1], scales[1]))
plt.ylabel('Probability density')
plt.xlabel('Value')
plt.legend()
plt.show()

/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning


# Check their batch_shape and event_shape
batch_of_normals

<tfp.distributions.Normal 'Normal' batch_shape=[2] event_shape=[] dtype=float32>


# Use Independent to convert the batch shape to the event shape
bivariate_normal_from_Independent = tfd.Independent(batch_of_normals, reinterpreted_batch_ndims = 1)


# Note that dimension from batch_shape has shifted to event_shape
bivariate_normal_from_Independent

<tfp.distributions.Independent 'IndependentNormal' batch_shape=[] event_shape=[2] dtype=float32>


# Create a plot showing joint density contours and marginal density functions
samples = bivariate_normal_from_Independent.sample(10000)
x1 = samples[:, 0]
x2 = samples[:, 1]
sns.jointplot(x1, x2, kind = 'kde', space = 0, color = 'b', xlim = [-4, 4], ylim = [-4, 4])

/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

<seaborn.axisgrid.JointGrid at 0x7f2dbec1d190>


# Use MultivariateNormalDiag to create the equivalent distribution
# Note that diagonal covariance mat rix => no correlation => independence
# (for the multivariate normal distribution)
bivariate_normal_from_Multivariate = tfd.MultivariateNormalDiag(loc = locs, scale_diag = scales)
bivariate_normal_from_Multivariate

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[] event_shape=[2] dtype=float32>


# Plot the joint density function of bivariate_normal_from_Independent
# Refer back to bivariate_normal_from_Independent to show that the plot is the same
# Summarise how Independent has been used
samples = bivariate_normal_from_Multivariate.sample(10000)
x1 = samples[:, 0]
x2 = samples[:, 1]
sns.jointplot(x1, x2, kind = 'kde', space = 0, color = 'b', xlim = [-4, 4], ylim = [-4, 4])

/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

<seaborn.axisgrid.JointGrid at 0x7f2dbec1d9d0>


# Demonstrate use of reinterpreted_batch_ndims
# By default all batch dims except the first are transferred to event dims
loc_grid = [[-100., -100.], [100., 100.], [0., 0.]]
scale_grid = [[1., 10.], [1., 10.], [1., 1.]]
normals_batch_3by2_event_1 = tfd.Normal(loc = loc_grid, scale = scale_grid)


# Highlight batch_shape
normals_batch_3by2_event_1

<tfp.distributions.Normal 'Normal' batch_shape=[3, 2] event_shape=[] dtype=float32>


# We now have a batch of 3 bivariate normal distributions,
# each parametrised by a column of our original parameter grid
normals_batch_3_event_2 = tfd.Independent(normals_batch_3by2_event_1)
normals_batch_3_event_2

<tfp.distributions.Independent 'IndependentNormal' batch_shape=[3] event_shape=[2] dtype=float32>


# Evaluate log_prob
normals_batch_3_event_2.log_prob(value = [[-10., 10.], [100., 100.], [1., 1.]])

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([-4.1146406e+03, -4.1404624e+00, -2.8378770e+00], dtype=float32)>


# Can reinterpret _all_ batch dimensions as event dimensions
normals_batch_1_event_3by2 = tfd.Independent \
    (normals_batch_3by2_event_1, reinterpreted_batch_ndims = 2)
normals_batch_1_event_3by2

<tfp.distributions.Independent 'IndependentNormal' batch_shape=[] event_shape=[3, 2] dtype=float32>


# Take log_probs 
normals_batch_1_event_3by2.log_prob(value = [[-10., 10.], [100., 100.], [1., 1.]])

<tf.Tensor: shape=(), dtype=float32, numpy=-4121.619>


# Convenience function for retrieving the 20 newsgroups data set

# Usenet was a forerunner to modern internet forums
# Users could post and read articles
# Newsgroup corresponded to a topic
# Example topics in this data set: IBM computer hardware, baseball
# Our objective is to use an article's contents to predict its newsgroup,
# a 20-class classification problem.

# 18000 newsgroups, posts on 20 topics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer


# Get the train data
newsgroups_data = fetch_20newsgroups(data_home = '20_Newsgroup_Data/', subset = 'train')


# More information about the data set
print(newsgroups_data['DESCR'][:1000])

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    =================   ==========
    Classes                     20
    Samples total            18846
    Dimensionality


# Example article
print(newsgroups_data['data'][0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----


# Associated label
newsgroups_data['target'][0]

7


# Name of label
newsgroups_data['target_names'][7]

'rec.autos'


# Preprocessing boilerplate
n_documents = len(newsgroups_data['data'])
count_vectorizer = CountVectorizer \
    (input = 'content', binary = True, max_df = 0.25, min_df = 1.01 / n_documents) # ignore common words, words that appear once

# input is a list of strings
binary_bag_of_words = count_vectorizer.fit_transform(newsgroups_data['data'])


# Check shape
binary_bag_of_words.shape

(11314, 56365)


# Check that the fit has been successful
count_vectorizer.inverse_transform(binary_bag_of_words[0, :])

[array(['lerxst', 'wam', 'umd', 'where', 'thing', 'car', 'rac3',
        'maryland', 'college', 'park', '15', 'wondering', 'anyone',
        'could', 'enlighten', 'saw', 'day', 'door', 'sports', 'looked',
        'late', '60s', 'early', '70s', 'called', 'bricklin', 'doors',
        'were', 'really', 'small', 'addition', 'front', 'bumper',
        'separate', 'rest', 'body', 'tellme', 'model', 'name', 'engine',
        'specs', 'years', 'production', 'made', 'history', 'whatever',
        'info', 'funky', 'looking', 'please', 'mail', 'thanks', 'il',
        'brought', 'neighborhood'], dtype='<U80')]


# Dict that will be useful later 
inv_vocabulary = {value:key for key, value in count_vectorizer.vocabulary_.items()}


# Compute the parameter estimates (adjusted fraction of documents in class that contain word)
n_classes = newsgroups_data['target'].max() + 1
y = newsgroups_data['target']
n_words = binary_bag_of_words.shape[1]

alpha = 1e-6 # parameters for Laplace smoothing

theta = np.zeros([n_classes, n_words]) # stores parameter values - prob. word given class
for c_k in range(n_classes): # 0, 1, ..., 19
    class_mask = (y == c_k)
    N = class_mask.sum() # number of articles in class
    theta[c_k, :] = (binary_bag_of_words[class_mask, :].sum(axis = 0) + alpha) / (N + alpha * 2)


# Check whether the most probable word in each class is reasonable
most_probable_word_ix = theta.argmax(axis = 1) # most probable word for each class

for j, ix in enumerate(most_probable_word_ix):
    print (
        'Most probable word in class {} is "{}".'.format \
        (newsgroups_data['target_names'][j], inv_vocabulary[ix])
    )

Most probable word in class alt.atheism is "people".
Most probable word in class comp.graphics is "graphics".
Most probable word in class comp.os.ms-windows.misc is "windows".
Most probable word in class comp.sys.ibm.pc.hardware is "thanks".
Most probable word in class comp.sys.mac.hardware is "mac".
Most probable word in class comp.windows.x is "window".
Most probable word in class misc.forsale is "sale".
Most probable word in class rec.autos is "car".
Most probable word in class rec.motorcycles is "dod".
Most probable word in class rec.sport.baseball is "he".
Most probable word in class rec.sport.hockey is "ca".
Most probable word in class sci.crypt is "clipper".
Most probable word in class sci.electronics is "use".
Most probable word in class sci.med is "reply".
Most probable word in class sci.space is "space".
Most probable word in class soc.religion.christian is "god".
Most probable word in class talk.politics.guns is "people".
Most probable word in class talk.politics.mideast is "people".
Most probable word in class talk.politics.misc is "people".
Most probable word in class talk.religion.misc is "he".


# Define a distribution for each class
batch_of_bernoullis = tfd.Bernoulli(probs = theta)
p_x_given_y = tfd.Independent(batch_of_bernoullis, reinterpreted_batch_ndims = 1)
p_x_given_y

<tfp.distributions.Independent 'IndependentBernoulli' batch_shape=[20] event_shape=[56365] dtype=int32>


# Take a sample of words from each class
n_samples = 10
sample = p_x_given_y.sample(n_samples)
sample.shape

TensorShape([10, 20, 56365])


# Choose a class
chosen_class = 15
newsgroups_data['target_names'][chosen_class]

'soc.religion.christian'


# Indicators for words that appear in the sample
class_sample = sample[:, chosen_class, :]
class_sample

<tf.Tensor: shape=(10, 56365), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>


# Perform inverse transform to test quality of fit
count_vectorizer.inverse_transform(class_sample)[0]

array(['09', '19', '23', '28', '31', '34', '35', '41', '78228', 'acs',
       'actually', 'advise', 'ages', 'also', 'andrew', 'anyone',
       'anything', 'appease', 'apr', 'bathroom', 'bc', 'because',
       'beggar', 'believe', 'better', 'bigelow', 'brains', 'chest',
       'child', 'claim', 'cleaned', 'commented', 'concepts', 'country',
       'day', 'daycare', 'death', 'decided', 'definition', 'detailed',
       'difference', 'disciple', 'discuss', 'does', 'emotional',
       'encourage', 'even', 'excellent', 'explain', 'faith', 'familiar',
       'fellowship', 'forgiven', 'free', 'garnet', 'gentiles', 'go',
       'god', 'good', 'growing', 'guess', 'hear', 'heard', 'here', 'him',
       'hold', 'honoring', 'however', 'ignore', 'kind', 'knowing', 'last',
       'lawrence', 'least', 'legal', 'less', 'life', 'minds', 'moment',
       'monotheism', 'must', 'never', 'now', 'offers', 'original', 'our',
       'over', 'pa', 'path', 'perception', 'person', 'personally', 'phil',
       'political', 'predestination', 'previously', 'problem', 'problems',
       'prove', 'punish', 'questionnaire', 'qui', 'quite', 'quote',
       'quotes', 'reading', 'real', 'reasonable', 'requirement', 'result',
       'reversed', 'say', 'second', 'sense', 'shaken', 'small', 'sons',
       'souls', 'span', 'speaking', 'spiritually', 'ssd', 'statements',
       'stem', 'still', 'study', 'take', 'talk', 'testament', 'than',
       'their', 'them', 'then', 'theories', 'these', 'things', 'think',
       'through', 'trying', 'two', 'uk', 'understand', 'us', 'uu4',
       'verse', 'very', 'views', 'vituperousness', 'wasn', 'watt', 'way',
       'well', 'whose', 'why', 'willing', 'wisdom', 'words', 'world'],
      dtype='<U80')


# Make Multivariate Distribution
normal_distributions = tfd.MultivariateNormalDiag (
    loc = [[0.5, 1], [0.1, 0], [0, 0.2]],
    scale_diag = [[2, 3], [1, 3], [4, 4]]
)
normal_distributions

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[3] event_shape=[2] dtype=float32>


# Sample
normal_distributions.sample(5)

<tf.Tensor: shape=(5, 3, 2), dtype=float32, numpy=
array([[[-0.61642456, -2.6508875 ],
        [-0.02484179, -0.58626443],
        [-2.384066  , -2.2478259 ]],

       [[-1.3702699 ,  9.627965  ],
        [ 0.34466884, -2.335009  ],
        [-0.6538167 ,  5.7416673 ]],

       [[-2.086178  ,  4.7825446 ],
        [ 1.154147  , -4.501431  ],
        [-4.3120627 ,  0.32013714]],

       [[-2.5057547 ,  2.4638143 ],
        [-0.11568717, -5.0724864 ],
        [ 0.63433516, -0.52497053]],

       [[ 1.380878  ,  1.7464799 ],
        [ 0.11383934,  1.1534938 ],
        [-0.91430813,  2.0012252 ]]], dtype=float32)>


# Multivariate Normal batched Distribution
# We are broadcasting batch shapes of `loc` and `scal_diag` 
# against each other
loc = [[[0.3, 1.5, 1.], [0.2, 0.4, 2.8]], [[2., 2.3, 8], [1.4, 1, 1.3]]]
scale_diag = [0.4, 1., 0.7]
normal_distributions = tfd.MultivariateNormalDiag(loc = loc, scale_diag = scale_diag)
normal_distributions

<tfp.distributions.MultivariateNormalDiag 'MultivariateNormalDiag' batch_shape=[2, 2] event_shape=[3] dtype=float32>


# Use independent to move part of the batch shape
ind_normal_distributions = tfd.Independent(normal_distributions, reinterpreted_batch_ndims = 1)
ind_normal_distributions

<tfp.distributions.Independent 'IndependentMultivariateNormalDiag' batch_shape=[2] event_shape=[2, 3] dtype=float32>


# Draw some samples
samples = ind_normal_distributions.sample(5)
samples.shape

TensorShape([5, 2, 2, 3])


# `[B, E]` shaped input
inp = tf.random.uniform((2, 2, 3))
ind_normal_distributions.log_prob(inp)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([-11.756409, -66.098785], dtype=float32)>


# `[E]` shaped input (broadcasting over batch size)
inp = tf.random.uniform((2, 3))
ind_normal_distributions.log_prob(inp)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([-11.850761, -76.12896 ], dtype=float32)>


#`[S, B, E]` shaped input (broadcasting over samples)
inp = tf.random.uniform((9, 2, 2, 3))
ind_normal_distributions.log_prob(inp)

<tf.Tensor: shape=(9, 2), dtype=float32, numpy=
array([[ -8.763502 , -74.403946 ],
       [ -8.963752 , -78.71576  ],
       [-10.053402 , -67.10482  ],
       [-10.113637 , -70.84384  ],
       [ -8.458472 , -77.99654  ],
       [-13.988488 , -74.964966 ],
       [ -7.66702  , -65.602325 ],
       [-10.51793  , -65.675446 ],
       [-11.9138565, -72.61048  ]], dtype=float32)>


# `[S, b, e]` shaped input, where [b, e] is broadcastable over [B, E]
inp = tf.random.uniform((5, 1, 2, 1))
ind_normal_distributions.log_prob(inp)

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[ -9.843089 , -61.68399  ],
       [-10.387953 , -61.109745 ],
       [ -9.2092285, -71.7383   ],
       [-12.003781 , -83.812454 ],
       [ -9.123208 , -68.52439  ]], dtype=float32)>


from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score


# Making a function get_data which:
#   1) Fetches the 20 newsgroup dataset
#   2) Performs a word count on the articles and binarizes the result
#   3) Returns the data as a numpy matrix with the labels
def get_data(categories):
    
    newsgroups_train_data = fetch_20newsgroups \
        (data_home = '20_Newsgroup_Data/', subset = 'train', categories = categories)
    newsgroups_test_data = fetch_20newsgroups \
        (data_home = '20_Newsgroup_Data/', subset = 'test', categories = categories)

    n_documents = len(newsgroups_train_data['data'])
    count_vectorizer = CountVectorizer \
        (input = 'content', binary = True, max_df = 0.25, min_df = 1.01 / n_documents)
    
    train_binary_bag_of_words = count_vectorizer.fit_transform(newsgroups_train_data['data'])
    test_binary_bag_of_words = count_vectorizer.transform(newsgroups_test_data['data']) 

    return (train_binary_bag_of_words.todense(), newsgroups_train_data['target']), \
           (test_binary_bag_of_words.todense(), newsgroups_test_data['target'])


# Defining a function to conduct Laplace smoothing.
# This adds a base level of probability for a given feature to occur in every class.
def laplace_smoothing(labels, binary_data, n_classes):
    # Compute the parameter estimates
    # (adjusted fraction of documents in class that contain word)
    n_words = binary_data.shape[1]
    alpha = 1 # parameters for Laplace smoothing
    theta = np.zeros([n_classes, n_words]) # stores parameter values - prob. word given class
    for c_k in range(n_classes): # 0, 1, ..., 19
        class_mask = (labels == c_k)
        N = class_mask.sum() # number of articles in class
        theta[c_k, :] = (binary_data[class_mask, :].sum(axis = 0) + alpha) / (N + alpha * 2)

    return theta


# Getting a subset of the 20 newsgroup dataset
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

(train_data, train_labels), (test_data, test_labels) = get_data(categories = categories)
smoothed_counts = laplace_smoothing \
    (labels = train_labels, binary_data = train_data, n_classes = len(categories))


# Function which computes the prior probability of every class
# based on frequency of occurence in the dataset
def class_priors(n_classes, labels):
    counts = np.zeros(n_classes)
    for c_k in range(n_classes):
        counts[c_k] = np.sum(np.where(labels == c_k, 1, 0))
    priors = counts / np.sum(counts)
    print('The class priors are {}'.format(priors))
    return priors


# Run the function 
priors = class_priors(n_classes = len(categories), labels = train_labels)

The class priors are [0.2359882  0.28711898 0.29154376 0.18534907]


# Now we will do a function that given the feature occurence counts returns a Bernoulli distribution of 
# batch_shape=number of classes and event_shape=number of features.
def make_distribution(probs):
    batch_of_bernoullis = tfd.Bernoulli(probs = probs)
    dist = tfd.Independent(batch_of_bernoullis, reinterpreted_batch_ndims = 1)
    return dist

tf_dist = make_distribution(smoothed_counts)
tf_dist

<tfp.distributions.Independent 'IndependentBernoulli' batch_shape=[4] event_shape=[17495] dtype=int32>


# The final function predict_sample which given the distribution, a test sample, and the class priors:
#   1) Computes the class conditional probabilities given the sample
#   2) Forms the joint likelihood
#   3) Normalises the joint likelihood and returns the log prob

def predict_sample(dist, sample, priors):
    cond_probs = dist.log_prob(sample)
    joint_likelihood = tf.add(np.log(priors), cond_probs)
    norm_factor = tf.math.reduce_logsumexp(joint_likelihood, axis = -1, keepdims = True)
    log_prob = joint_likelihood - norm_factor

    return log_prob


# Predicting one example from our test data
log_probs = predict_sample(tf_dist, test_data[0], priors)
log_probs

<tf.Tensor: shape=(4,), dtype=float32, numpy=
array([-6.1736343e+01, -1.5258789e-05, -1.1620026e+01, -6.3327866e+01],
      dtype=float32)>


# Loop over our test data and classify.
probabilities = []
for sample, label in zip(test_data, test_labels):
    probabilities.append(tf.exp(predict_sample(tf_dist, sample, priors)))

probabilities = np.asarray(probabilities)
predicted_classes = np.argmax(probabilities, axis = -1)
print('f1 ', f1_score(test_labels, predicted_classes, average = 'macro'))

f1  0.7848499112849504


# Make a Bernoulli Naive Bayes classifier using sklearn with the same level of alpha smoothing. 
clf = BernoulliNB(alpha = 1)
clf.fit(train_data, train_labels)
pred = clf.predict(test_data)
print('f1 from sklean ', f1_score(test_labels, pred, average = 'macro'))

f1 from sklean  0.7848499112849504


from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score


# Define an exponential distribution
exponential = tfd.Exponential(rate = 0.3, name = 'exp')


# Plot
plt.hist(exponential.sample(5000).numpy(), bins = 100, density = True)
plt.show()


# Define an exponential distribution with a trainable rate parameter
exp_train = tfd.Exponential(rate = tf.Variable(1., name = 'rate'), name = 'exp_train')
exp_train.trainable_variables

(<tf.Variable 'rate:0' shape=() dtype=float32, numpy=1.0>,)


# Define the negative log likelihood
def nll(x_train, distribution):
    return -tf.reduce_mean(distribution.log_prob(x_train))


# Define a function to compute the loss and gradients
@tf.function
def get_loss_and_grads(x_train, distribution):
    with tf.GradientTape() as tape:
        tape.watch(distribution.trainable_variables)
        loss = nll(x_train, distribution)
        grads = tape.gradient(loss, distribution.trainable_variables)
    return loss, grads


# Optimize
def exponential_dist_optimisation(data, distribution):

    # Keep results for plotting
    train_loss_results = []
    train_rate_results = []
    optimizer = tf.keras.optimizers.SGD(learning_rate = 0.05)
    num_steps = 10

    for i in range(num_steps):
        loss, grads = get_loss_and_grads(data, distribution)
        optimizer.apply_gradients(zip(grads, distribution.trainable_variables))
        
        rate_value = distribution.rate.value()
        train_loss_results.append(loss)
        train_rate_results.append(rate_value)
        print("Step {:03d}: Loss: {:.3f}: Rate: {:.3f}".format(i, loss, rate_value))
        
    return train_loss_results, train_rate_results


# Get some data and train
sampled_data = exponential.sample(5000)
train_loss_results, train_rate_results = exponential_dist_optimisation \
    (data = sampled_data, distribution = exp_train)

Step 000: Loss: 3.377: Rate: 0.881
Step 001: Loss: 3.102: Rate: 0.769
Step 002: Loss: 2.860: Rate: 0.665
Step 003: Loss: 2.654: Rate: 0.572
Step 004: Loss: 2.490: Rate: 0.490
Step 005: Loss: 2.368: Rate: 0.423
Step 006: Loss: 2.289: Rate: 0.373
Step 007: Loss: 2.246: Rate: 0.338
Step 008: Loss: 2.226: Rate: 0.317
Step 009: Loss: 2.219: Rate: 0.306


# Predicted value for the rate parameter
pred_value = exp_train.rate.numpy()
exact_value = exponential.rate.numpy()

print('Exact rate: ', exact_value)
print('Pred rate:  ', pred_value)

Exact rate:  0.3
Pred rate:   0.3058872


# Plot to see the convergence of the estimated and true parameters
tensor_exact_value = tf.constant(exact_value, shape = [len(train_rate_results)])

fig, axes = plt.subplots(2, sharex = True, figsize = (12, 8))
fig.suptitle('Convergence')

axes[0].set_ylabel("Loss", fontsize = 14)
axes[0].plot(train_loss_results)

axes[1].set_ylabel("Rate", fontsize = 14)
axes[1].set_xlabel("Epoch", fontsize = 14)
axes[1].plot(train_rate_results, label = 'trainable rate variable')
axes[1].plot(tensor_exact_value, label = 'exact rate')
axes[1].legend()
plt.show()


# Making a function get_data which:
#   1) Fetches the 20 newsgroup dataset
#   2) Performs a word count on the articles and binarizes the result
#   3) Returns the data as a numpy matrix with the labels
def get_data(categories):

    newsgroups_train_data = fetch_20newsgroups \
        (data_home = '20_Newsgroup_Data/', subset = 'train', categories = categories)
    newsgroups_test_data = fetch_20newsgroups \
        (data_home = '20_Newsgroup_Data/', subset = 'test', categories = categories)

    n_documents = len(newsgroups_train_data['data'])
    count_vectorizer = CountVectorizer \
        (input = 'content', binary = True, max_df = 0.25, min_df = 1.01 / n_documents) 
    train_binary_bag_of_words = count_vectorizer.fit_transform(newsgroups_train_data['data']) 
    test_binary_bag_of_words = count_vectorizer.transform(newsgroups_test_data['data']) 

    return (train_binary_bag_of_words.todense(), newsgroups_train_data['target']), \
           (test_binary_bag_of_words.todense(), newsgroups_test_data['target'])


# Defining a function to conduct laplace smoothing. This adds a base level of probability for a given feature
# to occur in every class.
def laplace_smoothing(labels, binary_data, n_classes):
    # Compute the parameter estimates (adjusted fraction of documents in class that contain word)
    n_words = binary_data.shape[1]
    alpha = 1 # parameters for Laplace smoothing
    theta = np.zeros([n_classes, n_words]) # stores parameter values - prob. word given class
    for c_k in range(n_classes): # 0, 1, ..., 19
        class_mask = (labels == c_k)
        N = class_mask.sum() # number of articles in class
        theta[c_k, :] = (binary_data[class_mask, :].sum(axis = 0) + alpha) / (N + alpha * 2)

    return theta


# Now we will do a function that given the feature occurence counts returns a Bernoulli distribution of 
# batch_shape=number of classes and event_shape=number of features.
def make_distributions(probs):
    batch_of_bernoullis = tfd.Bernoulli(probs = probs) # shape (n_classes, n_words)
    dist = tfd.Independent(batch_of_bernoullis, reinterpreted_batch_ndims = 1)
    return dist


# Function which computes the prior probability of every class based on frequency of occurence in 
# the dataset
def class_priors(n_classes, labels):
    counts = np.zeros(n_classes)
    for c_k in range(n_classes):
        counts[c_k] = np.sum(np.where(labels == c_k, 1, 0))
    priors = counts / np.sum(counts)
    print('The class priors are {}'.format(priors))
    return priors


# The final function predict_sample which given the distribution, a test sample, and the class priors:
#   1) Computes the class conditional probabilities given the sample
#   2) Forms the joint likelihood
#   3) Normalises the joint likelihood and returns the log prob
def predict_sample(dist, sample, priors):
    cond_probs = dist.log_prob(sample)
    joint_likelihood = tf.add(np.log(priors), cond_probs)
    norm_factor = tf.math.reduce_logsumexp(joint_likelihood, axis = -1, keepdims = True)
    log_prob = joint_likelihood - norm_factor
    return log_prob


# Now we learn the distribution using gradient tape
def make_distribution_withGT(data, labels, nb_classes):

    class_data = []
    train_vars = []
    distributions = []
    for c in range(nb_classes):
        train_vars.append(tf.Variable \
            (initial_value=np.random.uniform(low = 0.01, high = 0.1, size = data.shape[-1])))
        distributions.append(tfd.Bernoulli(probs = train_vars[c]))
        class_mask = (labels == c)
        class_data.append(data[class_mask, :])

    for c_num in range(0,nb_classes):
        optimizer = tf.keras.optimizers.Adam()
        print('\n%-------------------%')
        print('Class ', c_num)
        print('%-------------------%')

        for i in range(0, 100):
            loss, grads = get_loss_and_grads(class_data[c_num], distributions[c_num])
            if i % 10 == 0:
                print('iter: {} loss: {}'.format(i, loss))
            optimizer.apply_gradients(zip(grads, distributions[c_num].trainable_variables))
            eta = 1e-3
            clipped_probs = tf.clip_by_value \
                (distributions[c_num].trainable_variables, clip_value_min = eta, clip_value_max = 1)
            train_vars[c_num] = tf.squeeze(clipped_probs)

    dist = tfd.Bernoulli(probs = train_vars)
    dist = tfd.Independent(dist, reinterpreted_batch_ndims = 1)
    print(dist)
    return dist


# Make the same Naive Bayes classifier we did last tutorial
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
(train_data, train_labels), (test_data, test_labels) = get_data(categories)

smoothed_counts = laplace_smoothing \
    (labels = train_labels, binary_data = train_data, n_classes = len(categories))
priors = class_priors(n_classes = len(categories), labels = train_labels)
tf_dist = make_distributions(smoothed_counts)

The class priors are [0.2359882  0.28711898 0.29154376 0.18534907]


# Now train the distributions with gradient tape
GT_dist = make_distribution_withGT(data = train_data, labels = train_labels, nb_classes = 4)

%-------------------%
Class  0
%-------------------%
iter: 0 loss: 0.07829254456005681
iter: 10 loss: 0.06899232657607354
iter: 20 loss: 0.06031953688434695
iter: 30 loss: 0.05228091542158744
iter: 40 loss: 0.044827120954877314
iter: 50 loss: 0.037923124902828766
iter: 60 loss: 0.03153174284920682
iter: 70 loss: 0.02562061911389209
iter: 80 loss: 0.020148650299908685
iter: 90 loss: 0.015078609758130055

%-------------------%
Class  1
%-------------------%
iter: 0 loss: 0.07146276037880028
iter: 10 loss: 0.06212357453199265
iter: 20 loss: 0.05334094127326725
iter: 30 loss: 0.0451854947433399
iter: 40 loss: 0.03763059468595021
iter: 50 loss: 0.030644072872020898
iter: 60 loss: 0.024187183809748204
iter: 70 loss: 0.018231306796357812
iter: 80 loss: 0.01273630114869445
iter: 90 loss: 0.00764733326735143

%-------------------%
Class  2
%-------------------%
iter: 0 loss: 0.07831977980297099
iter: 10 loss: 0.06920893765933767
iter: 20 loss: 0.060811155571121364
iter: 30 loss: 0.05315205171401028
iter: 40 loss: 0.04618726901559432
iter: 50 loss: 0.03988330075349113
iter: 60 loss: 0.034195962461422966
iter: 70 loss: 0.029088565905427646
iter: 80 loss: 0.02450239545572051
iter: 90 loss: 0.020364445161358293

%-------------------%
Class  3
%-------------------%
iter: 0 loss: 0.07956402386025159
iter: 10 loss: 0.0703239091751621
iter: 20 loss: 0.06169454189197293
iter: 30 loss: 0.05368818463089667
iter: 40 loss: 0.04627427900950802
iter: 50 loss: 0.039410224060753805
iter: 60 loss: 0.03304793772691462
iter: 70 loss: 0.027140914093159326
iter: 80 loss: 0.021659204942776244
iter: 90 loss: 0.016548925857701124
tfp.distributions.Independent("IndependentBernoulli", batch_shape=[4], event_shape=[17495], dtype=int32)


# Compare the two results
for dist in [GT_dist,tf_dist]:
    probabilities = []
    for sample, label in zip(test_data, test_labels):
        probabilities.append(predict_sample(dist, sample, priors))

    probabilities = np.asarray(probabilities)
    predicted_classes = np.argmax(probabilities, axis = -1)
    print('f1 ', f1_score(test_labels, predicted_classes, average = 'macro'))

f1  0.8331666344676952
f1  0.7848499112849504


import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfpl = tfp.layers

print('TF version:', tf.__version__)
print('TFP version:', tfp.__version__)

TF version: 2.3.0
TFP version: 0.11.0


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop
import numpy as np
import matplotlib.pyplot as plt


# Create a sigmoid model, first deterministic, then probabilistic
model = Sequential ([
    Dense (
        input_shape = (1,), units = 1, activation = 'sigmoid',
        kernel_initializer = tf.constant_initializer(1),
        bias_initializer = tf.constant_initializer(0)
    ),
    tfpl.DistributionLambda (
        lambda t: tfd.Bernoulli(probs = t),
        convert_to_tensor_fn = tfd.Distribution.sample
    )
])

# Plot the function
x_plot = np.linspace(-5, 5, 100)
plt.scatter(x_plot, model.predict(x_plot), alpha = 0.4)
plt.plot(x_plot, 1 / (1 + np.exp(-x_plot)), color = 'r', alpha = 0.8)
plt.show()


# Create a constant input for this model
x = np.array([[0]])
x

array([[0]])


# Explore the feedforward object...
y_model = model(x)
y_model

<tfp.distributions.Bernoulli 'sequential_distribution_lambda_Bernoulli' batch_shape=[1, 1] event_shape=[] dtype=int32>


# ... and its behaviour under repeated calls
for _ in range(5):
    print(model.predict(x))

[[0]]
[[0]]
[[0]]
[[1]]
[[0]]


# Use the model to create 500 training points
x_train = np.linspace(-5, 5, 500)[:, np.newaxis]
y_train = model.predict(x_train)

# Plot the data and the mean of the distribution
fig, ax = plt.subplots(figsize = (5, 5))
ax.scatter(x_train, y_train, alpha = 0.04, color = 'blue', label = 'samples')
ax.plot(x_train, model(x_train).mean().numpy().flatten(), 
        color = 'red', alpha = 0.8, label = 'mean')
ax.legend()
plt.show()


# Create a new version of the model, with the wrong weights
model_untrained = Sequential ([
    Dense (
        input_shape = (1,), units = 1, activation = 'sigmoid',
        kernel_initializer = tf.constant_initializer(2),
        bias_initializer = tf.constant_initializer(2)
    ),
    tfpl.DistributionLambda (
        lambda t: tfd.Bernoulli(probs = t),
        convert_to_tensor_fn = tfd.Distribution.sample
    )
])


# Define negative loglikelihood, which we will use for training
def nll(y_true, y_pred):
    return -y_pred.log_prob(y_true)


# Compile untrained model
model_untrained.compile(loss = nll, optimizer = RMSprop(learning_rate = 0.01))


# Train model, record weights after each epoch
epochs = [0]
training_weights = [model_untrained.weights[0].numpy()[0, 0]]
training_bias = [model_untrained.weights[1].numpy()[0]]
for epoch in range(100):
    model_untrained.fit(x = x_train, y = y_train, epochs = 1, verbose = False)
    epochs.append(epoch)
    training_weights.append(model_untrained.weights[0].numpy()[0, 0])
    training_bias.append(model_untrained.weights[1].numpy()[0])


# Plot the model weights as they train, converging to the correct values
plt.plot(epochs, training_weights, label = 'weight')
plt.plot(epochs, training_bias, label = 'bias')
plt.axhline(y = 1, label = 'true_weight', color = 'k', linestyle = ':')
plt.axhline(y = 0, label = 'true_bias', color = 'k', linestyle = '--')
plt.xlabel('Epochs')
plt.legend()
plt.show()


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import RMSprop
import numpy as np
import matplotlib.pyplot as plt


# Create and plot 100 points of training data
x_train = np.linspace(-1, 1, 100)[:, np.newaxis]
y_train = x_train + 0.3 * np.random.randn(100)[:, np.newaxis]

plt.scatter(x_train, y_train, alpha = 0.4)
plt.xlabel('x')
plt.ylabel('y')
plt.show()


# Create and train deterministic linear model using mean squared error loss

# Create linear regression via Sequential model
model = Sequential([Dense(units = 1, input_shape = (1,))])
model.compile(loss = MeanSquaredError(), optimizer = RMSprop(learning_rate = 0.005))
model.summary()
model.fit(x_train, y_train, epochs = 200, verbose = False)

# Plot the data and model
plt.scatter(x_train, y_train, alpha = 0.4, label = 'data')
plt.plot(x_train, model.predict(x_train), color = 'red', alpha = 0.8, label = 'model')
plt.legend()
plt.show()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_2 (Dense)              (None, 1)                 2         
=================================================================
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________


# Examine the model predictions
x = np.array([[0]])
y_model = model(x)
y_model

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.05189721]], dtype=float32)>


# Create probabilistic regression with normal distribution as final layer
event_shape = 1
model = Sequential ([
    Dense(units = tfpl.IndependentNormal.params_size(event_shape), input_shape = (1,)),
    tfpl.IndependentNormal(event_shape)
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_3 (Dense)              (None, 2)                 4         
_________________________________________________________________
independent_normal (Independ ((None, 1), (None, 1))    0         
=================================================================
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________


# Train model using the negative loglikelihood
def nll(y_true, y_pred):
    return -y_pred.log_prob(y_true)

model.compile(loss = nll, optimizer = RMSprop(learning_rate = 0.005))
model.fit(x_train, y_train, epochs = 200, verbose = False)

<tensorflow.python.keras.callbacks.History at 0x7fb3f45d8e50>


# Examine the distribution created as a feedforward value
y_model = model(x)
y_model

<tfp.distributions.Independent 'sequential_3_independent_normal_IndependentNormal_Independentsequential_3_independent_normal_IndependentNormal_Normal' batch_shape=[1] event_shape=[1] dtype=float32>


# Plot the data and a sample from the model
y_model = model(x_train)
y_sample = y_model.sample()
y_hat = y_model.mean()
y_sd = y_model.stddev()
y_hat_m2sd = y_hat - 2 * y_sd
y_hat_p2sd = y_hat + 2 * y_sd

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 5), sharey = True)
ax1.scatter(x_train, y_train, alpha = 0.4, label = 'data')
ax1.scatter(x_train, y_sample, alpha = 0.4, color = 'red', label = 'model sample')
ax1.legend()
ax2.scatter(x_train, y_train, alpha = 0.4, label = 'data')
ax2.plot(x_train, y_hat, color = 'red', alpha = 0.8, label = 'model $\mu$')
ax2.plot(x_train, y_hat_m2sd, color = 'green', alpha = 0.8, label = 'model $\mu \pm 2 \sigma$')
ax2.plot(x_train, y_hat_p2sd, color = 'green', alpha = 0.8)
ax2.legend()
plt.show()


# Create and plot 10000 data points
x_train = np.linspace(-1, 1, 1000)[:, np.newaxis]
y_train = np.power(x_train, 3) + 0.1 * (2 + x_train) * np.random.randn(1000)[:, np.newaxis]

plt.scatter(x_train, y_train, alpha = 0.1)
plt.show()


# Create probabilistic regression: normal distribution with fixed variance
model = Sequential ([
    Dense(input_shape = (1,), units = 8, activation = 'sigmoid'),
    Dense(tfpl.IndependentNormal.params_size(event_shape = 1)),
    tfpl.IndependentNormal(event_shape = 1)
])
model.compile(loss = nll, optimizer = RMSprop(learning_rate = 0.01))
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_4 (Dense)              (None, 8)                 16        
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 18        
_________________________________________________________________
independent_normal_1 (Indepe ((None, 1), (None, 1))    0         
=================================================================
Total params: 34
Trainable params: 34
Non-trainable params: 0
_________________________________________________________________


# Train model
model.fit(x_train, y_train, epochs = 200, verbose = False)
model.evaluate(x_train, y_train)

32/32 [==============================] - 0s 3ms/step - loss: 0.0217

0.021747954189777374


# Plot the data and a sample from the model
y_model = model(x_train)
y_sample = y_model.sample()
y_hat = y_model.mean()
y_sd = y_model.stddev()
y_hat_m2sd = y_hat - 2 * y_sd
y_hat_p2sd = y_hat + 2 * y_sd

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 5), sharey=True)
ax1.scatter(x_train, y_train, alpha = 0.2, label = 'data')
ax1.scatter(x_train, y_sample, alpha = 0.2, color = 'red', label = 'model sample')
ax1.legend()
ax2.scatter(x_train, y_train, alpha = 0.2, label = 'data')
ax2.plot(x_train, y_hat, color = 'red', alpha = 0.8, label = 'model $\mu$')
ax2.plot(x_train, y_hat_m2sd, color = 'green', alpha = 0.8, label = 'model $\mu \pm 2 \sigma$')
ax2.plot(x_train, y_hat_p2sd, color = 'green', alpha = 0.8)
ax2.legend()
plt.show()


from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import RMSprop
import numpy as np
import matplotlib.pyplot as plt


# Use the same data as before -- create and plot 100 data points
x_train = np.linspace(-1, 1, 100)[:, np.newaxis]
y_train = x_train + 0.3 * np.random.randn(100)[:, np.newaxis]

plt.scatter(x_train, y_train, alpha = 0.4)
plt.show()


# Define the prior weight distribution -- all N(0, 1) -- and not trainable
def prior(kernel_size, bias_size, dtype = None):
    n = kernel_size + bias_size
    prior_model = Sequential ([
        tfpl.DistributionLambda \
            (lambda t: tfd.MultivariateNormalDiag(loc = tf.zeros(n), scale_diag = tf.ones(n)))
    ])
    return prior_model


# Define variational posterior weight distribution -- multivariate Gaussian
def posterior(kernel_size, bias_size, dtype = None):
    n = kernel_size + bias_size
    posterior_model = Sequential ([
        tfpl.VariableLayer(tfpl.MultivariateNormalTriL.params_size(n), dtype = dtype),
        tfpl.MultivariateNormalTriL(n)
    ])
    return posterior_model


# Create linear regression model with weight uncertainty: weights are
# distributed according to posterior (and, indirectly, prior) distribution
model = Sequential ([
    tfpl.DenseVariational (
        input_shape = (1,), units = 1, 
        make_prior_fn = prior, make_posterior_fn = posterior,
        kl_weight = 1 / x_train.shape[0], kl_use_exact = True
    )
])
model.compile(loss = MeanSquaredError(), optimizer = RMSprop(learning_rate = 0.005))
model.summary()

WARNING:tensorflow:From /home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py:158: calling LinearOperator.__init__ (from tensorflow.python.ops.linalg.linear_operator) with graph_parents is deprecated and will be removed in a future version.
Instructions for updating:
Do not pass `graph_parents`.  They will  no longer be used.
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_variational (DenseVari (None, 1)                 5         
=================================================================
Total params: 5
Trainable params: 5
Non-trainable params: 0
_________________________________________________________________


# Fit the model, just like a deterministic linear regression
model.fit(x_train, y_train, epochs = 500, verbose = False)

<tensorflow.python.keras.callbacks.History at 0x7fb3f42e2550>


# Check out the parameters of the prior and posterior distribution
dummy_input = np.array([[0]])
model_prior = model.layers[0]._prior(dummy_input)
model_posterior = model.layers[0]._posterior(dummy_input)
print('prior mean:           ', model_prior.mean().numpy())
print('prior variance:       ', model_prior.variance().numpy())
print('posterior mean:       ', model_posterior.mean().numpy())
print('posterior covariance: ', model_posterior.covariance().numpy()[0])
print('                      ', model_posterior.covariance().numpy()[1])

prior mean:            [0. 0.]
prior variance:        [1. 1.]
posterior mean:        [ 1.0078331 -0.0018866]
posterior covariance:  [0.01788209 0.00079696]
                       [0.00079696 0.00576819]


# Plot an ensemble of linear regressions, with weights sampled from
# the posterior distribution
plt.scatter(x_train, y_train, alpha = 0.4, label = 'data')
for _ in range(10):
    y_model = model(x_train)
    if _ == 0:
        plt.plot(x_train, y_model, color = 'red', alpha = 0.8, label = 'model')
    else:
        plt.plot(x_train, y_model, color = 'red', alpha = 0.8)        
plt.legend()
plt.show()


# Create two datasets, one with 1000 points, another with 100
x_train_1000 = np.linspace(-1, 1, 1000)[:, np.newaxis]
y_train_1000 = x_train_1000 + 0.3 * np.random.randn(1000)[:, np.newaxis]

x_train_100 = np.linspace(-1, 1, 100)[:, np.newaxis]
y_train_100 = x_train_100 + 0.3 * np.random.randn(100)[:, np.newaxis]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 4), sharex = True, sharey = True)
ax1.scatter(x_train_1000, y_train_1000, alpha = 0.1)
ax2.scatter(x_train_100, y_train_100, alpha = 0.4)
plt.show()


# Train a model on each dataset

model_1000 = Sequential ([
    tfpl.DenseVariational (
        input_shape = (1,), units = 1,
        make_prior_fn = prior, make_posterior_fn = posterior, kl_weight = 1 / 1000
    )
])

model_100 = Sequential ([
    tfpl.DenseVariational (
        input_shape = (1,), units = 1,
        make_prior_fn = prior, make_posterior_fn = posterior, kl_weight = 1 / 100
    )
])

model_1000.compile(loss = MeanSquaredError(), optimizer = RMSprop(learning_rate = 0.005))
model_100.compile(loss = MeanSquaredError(), optimizer = RMSprop(learning_rate = 0.005))

model_1000.fit(x_train_1000, y_train_1000, epochs = 50, verbose = False)
model_100.fit(x_train_100, y_train_100, epochs = 500, verbose = False)

<tensorflow.python.keras.callbacks.History at 0x7fb3d85f2b10>


# Plot an ensemble of linear regressions from each model
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 4), sharex = True, sharey = True)
for _ in range(10):
    y_model_1000 = model_1000(x_train_1000)
    ax1.scatter(x_train_1000, y_train_1000, color = 'C0', alpha = 0.02)
    ax1.plot(x_train_1000, y_model_1000, color = 'red', alpha = 0.8)
    y_model_100 = model_100(x_train_100)
    ax2.scatter(x_train_100, y_train_100, color = 'C0', alpha = 0.05)
    ax2.plot(x_train_100, y_model_100, color = 'red', alpha = 0.8)
plt.show()


# Create and plot 1000 data points
x_train = np.linspace(-1, 1, 1000)[:, np.newaxis]
y_train = np.power(x_train, 3) + 0.1 * (2 + x_train) * np.random.randn(1000)[:, np.newaxis]

plt.scatter(x_train, y_train, alpha = 0.1)
plt.show()


# Create probabilistic regression with one hidden layer, weight uncertainty
model = Sequential ([
    tfpl.DenseVariational (
        units = 8, input_shape = (1,),
        make_prior_fn = prior, make_posterior_fn = posterior,
        kl_weight = 1 / x_train.shape[0], activation = 'sigmoid'
    ),
    tfpl.DenseVariational (
        units = tfpl.IndependentNormal.params_size(1),
        make_prior_fn = prior, make_posterior_fn = posterior,
        kl_weight = 1 / x_train.shape[0]
    ),
    tfpl.IndependentNormal(1)
])

def nll(y_true, y_pred):
    return -y_pred.log_prob(y_true)

model.compile(loss = nll, optimizer = RMSprop(learning_rate = 0.005))
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_variational_3 (DenseVa (None, 8)                 152       
_________________________________________________________________
dense_variational_4 (DenseVa (None, 2)                 189       
_________________________________________________________________
independent_normal_2 (Indepe ((None, 1), (None, 1))    0         
=================================================================
Total params: 341
Trainable params: 341
Non-trainable params: 0
_________________________________________________________________


# Train the model
model.fit(x_train, y_train, epochs = 1000, verbose = False)
model.evaluate(x_train, y_train)

32/32 [==============================] - 0s 4ms/step - loss: -0.0065

-0.00650815200060606


# Plot an ensemble of trained probabilistic regressions
plt.scatter(x_train, y_train, marker = '.', alpha = 0.2, label = 'data')
for _ in range(5):
    y_model = model(x_train)
    y_hat = y_model.mean()
    y_hat_m2sd = y_hat - 2 * y_model.stddev()
    y_hat_p2sd = y_hat + 2 * y_model.stddev()
    if _ == 0:
        plt.plot(x_train, y_hat, color = 'red', alpha = 0.8, label = 'model $\mu$')
        plt.plot(x_train, y_hat_m2sd, color = 'green', alpha = 0.8, label = 'model $\mu \pm 2 \sigma$')
        plt.plot(x_train, y_hat_p2sd, color = 'green', alpha = 0.8)
    else:
        plt.plot(x_train, y_hat, color = 'red', alpha = 0.8)
        plt.plot(x_train, y_hat_m2sd, color = 'green', alpha = 0.8)
        plt.plot(x_train, y_hat_p2sd, color = 'green', alpha = 0.8)        
plt.legend()
plt.show()


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import RMSprop
import os
import numpy as np
import matplotlib.pyplot as plt


# Load the HAR dataset and create some data processing functions

# Function to load the data from file
def load_HAR_data():
    data_dir = 'data/HAR/'
    x_train = np.load(os.path.join(data_dir, 'x_train.npy'))[..., :6]
    y_train = np.load(os.path.join(data_dir, 'y_train.npy')) - 1
    x_test  = np.load(os.path.join(data_dir, 'x_test.npy'))[..., :6]
    y_test  = np.load(os.path.join(data_dir, 'y_test.npy')) - 1
    return (x_train, y_train), (x_test, y_test)

# Dictionary containing the labels and the associated activities
label_to_activity = {
    0: 'walking horizontally', 1: 'walking upstairs', 2: 'walking downstairs',
    3: 'sitting', 4: 'standing', 5: 'laying'
}

# Function to change integer labels to one-hot labels
def integer_to_onehot(data_integer):
    data_onehot = np.zeros(shape=(data_integer.shape[0], data_integer.max() + 1))
    for row in range(data_integer.shape[0]):
        integer = int(data_integer[row])
        data_onehot[row, integer] = 1
    return data_onehot

# Load the data
(x_train, y_train), (x_test, y_test) = load_HAR_data()
y_train_oh = integer_to_onehot(y_train)
y_test_oh = integer_to_onehot(y_test)


# Inspect some of the data by making plots
def make_plots(num_examples_per_category):
    for label in range(6):
        x_label = x_train[y_train[:, 0] == label]
        for i in range(num_examples_per_category):
            fig, ax = plt.subplots(figsize = (10, 1))
            ax.imshow(x_label[100 * i].T, cmap = 'Greys', vmin = -1, vmax = 1)
            ax.axis('off')
            if i == 0:
                ax.set_title(label_to_activity[label])
            plt.show()
        
make_plots(1)


# Create standard deterministic model with:
# - Conv1D
# - MaxPooling
# - Flatten
# - Dense with Softmax
model = Sequential ([
    Conv1D(input_shape = (128, 6), filters = 8, kernel_size = 16, activation = 'relu'),
    MaxPooling1D(pool_size = 16),
    Flatten(),
    Dense(units = 6, activation = 'softmax')
])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv1d (Conv1D)              (None, 113, 8)            776       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 7, 8)              0         
_________________________________________________________________
flatten (Flatten)            (None, 56)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 342       
=================================================================
Total params: 1,118
Trainable params: 1,118
Non-trainable params: 0
_________________________________________________________________


# Create probablistic model with the following layers:
#  - Conv1D
#  - MaxPooling
#  - Flatten
#  - Dense
#  - OneHotCategorical
divergence_fn = lambda q, p, _: tfd.kl_divergence(q, p) / x_train.shape[0]
model = Sequential ([
    tfpl.Convolution1DReparameterization (
        input_shape = (128, 6), filters = 8, kernel_size = 16, activation = 'relu',
        kernel_prior_fn = tfpl.default_multivariate_normal_fn,
        kernel_posterior_fn = tfpl.default_mean_field_normal_fn(is_singular = False),
        kernel_divergence_fn = divergence_fn,
        bias_prior_fn = tfpl.default_multivariate_normal_fn,
        bias_posterior_fn = tfpl.default_mean_field_normal_fn(is_singular = False),
        bias_divergence_fn = divergence_fn,
    ),
    MaxPooling1D(pool_size = 16),
    Flatten(),
    tfpl.DenseReparameterization (
        units = tfpl.OneHotCategorical.params_size(6), activation = None,
        kernel_prior_fn = tfpl.default_multivariate_normal_fn,
        kernel_posterior_fn = tfpl.default_mean_field_normal_fn(is_singular = False),
        kernel_divergence_fn = divergence_fn,
        bias_prior_fn = tfpl.default_multivariate_normal_fn,
        bias_posterior_fn = tfpl.default_mean_field_normal_fn(is_singular = False),
        bias_divergence_fn = divergence_fn,
    ),
    tfpl.OneHotCategorical(6)
])
model.summary()

WARNING:tensorflow:From /home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/tensorflow_probability/python/layers/util.py:106: Layer.add_variable (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.add_weight` method instead.
Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv1d_reparameterization (C (None, 113, 8)            1552      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 7, 8)              0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 56)                0         
_________________________________________________________________
dense_reparameterization (De (None, 6)                 684       
_________________________________________________________________
one_hot_categorical (OneHotC ((None, 6), (None, 6))    0         
=================================================================
Total params: 2,236
Trainable params: 2,236
Non-trainable params: 0
_________________________________________________________________


# Replace analytical Kullback-Leibler divergence with approximated one
def kl_approx(q, p, q_tensor):
    return tf.reduce_mean(q.log_prob(q_tensor) - p.log_prob(q_tensor))

divergence_fn = lambda q, p, q_tensor : kl_approx(q, p, q_tensor) / x_train.shape[0]


# Compile the model using the negative loglikelihood
def nll(y_true, y_pred):
    return -y_pred.log_prob(y_true)

model.compile (
    loss = nll, optimizer = RMSprop(learning_rate = 0.005),
    metrics = ['accuracy'], experimental_run_tf_function = False
)


# Train the model
model.fit(x_train, y_train_oh, epochs = 20, verbose = False)
model.evaluate(x_train, y_train_oh)
model.evaluate(x_test, y_test_oh)

230/230 [==============================] - 2s 9ms/step - loss: 0.6452 - accuracy: 0.7514
93/93 [==============================] - 1s 9ms/step - loss: 1.0390 - accuracy: 0.7241

[1.0390419960021973, 0.7241262197494507]


# Define function to analyse model predictions versus true labels
def analyse_model_predictions(image_num):

    # Show the accelerometer data
    print('------------------------------')
    print('Accelerometer data:')
    fig, ax = plt.subplots(figsize = (10, 1))
    ax.imshow(x_test[image_num].T, cmap = 'Greys', vmin = -1, vmax = 1)
    ax.axis('off')
    plt.show()

    # Print the true activity
    print('------------------------------')
    print('True activity:', label_to_activity[y_test[image_num, 0]])
    print('')

    # Print the probabilities the model assigns
    print('------------------------------')
    print('Model estimated probabilities:')
    # Create ensemble of predicted probabilities
    predicted_probabilities = np.empty(shape = (200, 6))
    for i in range(200):
        predicted_probabilities[i] = model(x_test[image_num][np.newaxis, ...]).mean().numpy()[0]
    pct_2p5 = np.array([np.percentile(predicted_probabilities[:, i], 2.5) for i in range(6)])
    pct_97p5 = np.array([np.percentile(predicted_probabilities[:, i], 97.5) for i in range(6)])
    # Make the plots
    fig, ax = plt.subplots(figsize = (9, 3))
    bar = ax.bar(np.arange(6), pct_97p5, color = 'red')
    bar[y_test[image_num, 0]].set_color('green')
    bar = ax.bar(np.arange(6), pct_2p5 - 0.02, color = 'white', linewidth = 1, edgecolor = 'white')
    ax.set_xticklabels (
        [''] + [activity for activity in label_to_activity.values()],
        rotation = 45,
        horizontalalignment = 'right'
    )
    ax.set_ylim([0, 1])
    ax.set_ylabel('Probability')
    plt.show()


analyse_model_predictions(image_num = 79)

------------------------------
Accelerometer data:

------------------------------
True activity: walking horizontally

------------------------------
Model estimated probabilities:

/home/bacti/.local/lib/python3.7/site-packages/ipykernel_launcher.py:34: UserWarning: FixedFormatter should only be used together with FixedLocator


analyse_model_predictions(image_num = 633)

------------------------------
Accelerometer data:

------------------------------
True activity: standing

------------------------------
Model estimated probabilities:

/home/bacti/.local/lib/python3.7/site-packages/ipykernel_launcher.py:34: UserWarning: FixedFormatter should only be used together with FixedLocator


analyse_model_predictions(image_num = 1137)

------------------------------
Accelerometer data:

------------------------------
True activity: walking horizontally

------------------------------
Model estimated probabilities:

/home/bacti/.local/lib/python3.7/site-packages/ipykernel_launcher.py:34: UserWarning: FixedFormatter should only be used together with FixedLocator


import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors
print("TF version:", tf.__version__)
print("TFP version:", tfp.__version__)

TF version: 2.3.0
TFP version: 0.11.0


# Create the base distribution and a single sample
uniform = tfd.Uniform \
    (low = [0.0, 0.0], high = [1.0, 1.0], name = 'uniform2d')
x = uniform.sample()
x

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.79194343, 0.41313922], dtype=float32)>


# Create the ScaleMatvecDiag bijector
bijector = tfb.ScaleMatvecDiag(scale_diag = [1.5, -0.5])


# Apply the bijector to the sample point
y = bijector(x)
y

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 1.1879151 , -0.20656961], dtype=float32)>


# Create the ScaleMatvecTriL bijector
bijector = tfb.ScaleMatvecTriL(scale_tril = [[-1., 0.], [-1., -1.]])


# Apply the bijector to the sample x
y = bijector(x)
y

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([-0.79194343, -1.2050827 ], dtype=float32)>


# Create a scale and shift bijector
scale_bijector = tfb.ScaleMatvecTriL \
    (scale_tril = [[-1., 0.], [-1., -1.]])
shift_bijector = tfb.Shift([-1., 0.])
bijector = shift_bijector(scale_bijector)


# Apply the bijector to the sample x
y = bijector(x)
y

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([-1.7919434, -1.2050827], dtype=float32)>


# Apply the inverse transformation to the image of x
bijector = tfb.ScaleMatvecTriL \
    (scale_tril = [[-1., 0.], [-1., -1.]])
y = bijector.inverse(bijector(x))


# Check that all y and x values are the same
tf.reduce_all(y == x)

<tf.Tensor: shape=(), dtype=bool, numpy=True>


scale = tf.linalg.LinearOperatorDiag(diag = [1.5, -0.5])


# Create the ScaleMatvecLinearOperator bijector
bijector = tfb.ScaleMatvecLinearOperator(scale)


# Apply the bijector to the sample x
y = bijector(x)
y

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 1.1879151 , -0.20656961], dtype=float32)>


# Create a ScaleMatvecLinearOperator bijector
B = [[0.5, 1.5], [1.5, 0.5]]
scale = tf.linalg.LinearOperatorFullMatrix(matrix = B)
bijector = tfb.ScaleMatvecLinearOperator(scale)


# Apply the bijector to the sample x
y = bijector(x)
y

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.0156806, 1.3944848], dtype=float32)>


# Create 10 samples from the uniform distribution
x = uniform.sample(10)
x

<tf.Tensor: shape=(10, 2), dtype=float32, numpy=
array([[0.6960416 , 0.00880921],
       [0.48266065, 0.3516494 ],
       [0.06291747, 0.3056345 ],
       [0.37396407, 0.43012333],
       [0.14035296, 0.29497242],
       [0.19567811, 0.7274327 ],
       [0.69855285, 0.28542566],
       [0.5855237 , 0.06669152],
       [0.53690004, 0.43457592],
       [0.34628356, 0.70255816]], dtype=float32)>


# Recreate the diagonal matrix transformation with LinearOperatorDiag
scale = tf.linalg.LinearOperatorDiag(diag = [1.5, -0.5])
scale.to_dense()

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[ 1.5,  0. ],
       [ 0. , -0.5]], dtype=float32)>


# Create the ScaleMatvecLinearOperator bijector
bijector = tfb.ScaleMatvecLinearOperator(scale)


# Apply the bijector to the 10 samples
y = bijector(x)
y

<tf.Tensor: shape=(10, 2), dtype=float32, numpy=
array([[ 1.0440624 , -0.0044046 ],
       [ 0.723991  , -0.1758247 ],
       [ 0.09437621, -0.15281725],
       [ 0.5609461 , -0.21506166],
       [ 0.21052945, -0.14748621],
       [ 0.29351717, -0.36371636],
       [ 1.0478293 , -0.14271283],
       [ 0.8782856 , -0.03334576],
       [ 0.80535007, -0.21728796],
       [ 0.51942533, -0.35127908]], dtype=float32)>


# Create a batched ScaleMatvecLinearOperator bijector
diag = tf.stack \
    ((tf.constant([1, -1.]), tf.constant([-1, 1.])))  # (2, 2)
scale = tf.linalg.LinearOperatorDiag(diag = diag)  # (2, 2, 2)
bijector = tfb.ScaleMatvecLinearOperator(scale = scale)


# Add a singleton batch dimension to x
x = tf.expand_dims(x, axis = 1)
x.shape

TensorShape([10, 1, 2])


# Apply the batched bijector to x
y = bijector(x)
y.shape  # (S, B, E) shape semantics

TensorShape([10, 2, 2])


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors
tfpl = tfp.layers


# Define base distribution
normal = tfd.Normal(loc = 0., scale = 1.)


# Sample from base distribution
n = 10000
z = normal.sample(n)


# Define scale and shift
scale = 4.5
shift = 7


# Define chain bijector
scale_and_shift = tfb.Chain([tfb.Shift(shift), tfb.Scale(scale)])


# We can also use call methods
scale_transf = tfb.Scale(scale)
shift_transf = tfb.Shift(shift)
scale_and_shift = shift_transf(scale_transf)


# Apply the forward transformation
x = scale_and_shift.forward(z)


# Check the forward transformation
tf.norm(x - (scale * z + shift))

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>


# Plot z density
plt.hist(z.numpy(), bins = 60, density = True)
plt.show()


# Plot x density
plt.hist(x.numpy(), bins = 60, density = True)
plt.show()


# Apply inverse transformation
inv_x = scale_and_shift.inverse(x)


# Check inverse transformation
tf.norm(inv_x - z)

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>


# Compute log prob for x
log_prob_x = normal.log_prob(z) - scale_and_shift.forward_log_det_jacobian(z, event_ndims = 0)
log_prob_x

<tf.Tensor: shape=(10000,), dtype=float32, numpy=
array([-3.2562919, -5.2344127, -2.4325273, ..., -2.5546515, -2.4498444,
       -2.755146 ], dtype=float32)>


# We can also use the inverse transformation
log_prob_x = normal.log_prob(scale_and_shift.inverse(x)) + \
    scale_and_shift.inverse_log_det_jacobian(x, event_ndims = 0)
log_prob_x

<tf.Tensor: shape=(10000,), dtype=float32, numpy=
array([-3.2562919, -5.2344127, -2.4325273, ..., -2.5546515, -2.4498444,
       -2.755146 ], dtype=float32)>


x = tf.random.normal(shape = (100, 1))


# Softfloor bijector
softfloor = tfb.Softfloor(temperature = 0.01)
y = softfloor.forward(x)
y.shape

TensorShape([100, 1])


# Softfloor bijector using broadcasting
softfloor = tfb.Softfloor(temperature = [0.2, 1.])
y = softfloor.forward(x)
y.shape

TensorShape([100, 2])


# Softfloor bijector using broadcasting
softfloor = tfb.Softfloor(temperature = [0.01, 0.1, 1.])


# Plot routine
def _plot(nparams, bijector, params, x):
    bijector_params = tuple(getattr(bijector, name) for name in params)
    upper_params = [name[0].upper() + name[1:] for name in params]
    fig = plt.figure(figsize=(14, 5))
    lines = plt.plot(np.tile(x, nparams), bijector.forward(x))
    for l in zip(lines, *bijector_params):
        labels = ": {:.2f}, ".join(upper_params) + ': {:.2f}'
        l[0].set_label(labels.format(*l[1:]))
    plt.legend()
    plt.show()


# Plot
x = np.linspace(-2, 2, 2000)[..., np.newaxis]
_plot(3, softfloor, ['temperature'], x)


# Gumbel bijector using broadcasting
exps = tfb.GumbelCDF(loc = [0.5, 1, 1.5, 2, 3], scale = [1, 2, 2, 3, 4])


# Plot
x = np.linspace(-10, 10, 2000, dtype = np.float32)[..., np.newaxis]
_plot(5, exps, ['loc', 'scale'], x)


# Parameters
n = 10000
loc = 0
scale = 0.5


# Normal distribution
normal = tfd.Normal(loc = loc, scale = scale)


# Display event and batch shape
print('batch shape: ', normal.batch_shape)
print('event shape: ', normal.event_shape)

batch shape:  ()
event shape:  ()


# Exponential bijector
exp = tfb.Exp()


# Log normal transformed distribution using exp and normal bijectors
log_normal_td = exp(normal)


# Display event and batch shape
print('batch shape: ', log_normal_td.batch_shape)
print('event shape: ', log_normal_td.event_shape)

batch shape:  ()
event shape:  ()


# Base distribution
z = normal.sample(n)


# Plot z density
plt.hist(z.numpy(), bins = 100, density = True)
plt.show()


# Transformed distribution
x = log_normal_td.sample(n)


# Plot x density
plt.hist(x.numpy(), bins = 100, density = True)
plt.show()


# Define log normal distribution
log_normal = tfd.LogNormal(loc = loc, scale = scale)


# Sample log_normal
l = log_normal.sample(n)


# Plot l density
plt.hist(l.numpy(), bins = 100, density = True)
plt.show()


# Log prob of LogNormal
log_prob = log_normal.log_prob(x)


# Log prob of log normal transformed distribution
log_prob_td = log_normal_td.log_prob(x)


# Check log probs
tf.norm(log_prob - log_prob_td)

<tf.Tensor: shape=(), dtype=float32, numpy=1.01100395e-05>


# Set a scaling lower triangular matrix
tril = tf.random.normal((2, 4, 4))
scale_low_tri = tf.linalg.LinearOperatorLowerTriangular(tril)


# View of scale_low_tri
scale_low_tri.to_dense()

<tf.Tensor: shape=(2, 4, 4), dtype=float32, numpy=
array([[[-0.44597602,  0.        ,  0.        ,  0.        ],
        [ 0.05159349, -0.09687567,  0.        ,  0.        ],
        [-0.46078765,  0.0207627 , -0.27233383,  0.        ],
        [-1.1664037 , -1.2637694 ,  0.5337775 ,  1.393418  ]],

       [[ 0.6948613 ,  0.        ,  0.        ,  0.        ],
        [ 1.1749753 ,  0.4086611 ,  0.        ,  0.        ],
        [-0.08414282,  0.4017596 , -0.17369917,  0.        ],
        [-0.4325606 , -0.18971127, -0.43916336, -1.9362649 ]]],
      dtype=float32)>


# Define scale linear operator
scale_lin_op = tfb.ScaleMatvecLinearOperator(scale_low_tri)


# Define scale linear operator transformed distribution with a batch and event shape
mvn = tfd.TransformedDistribution(normal, scale_lin_op, batch_shape = [2], event_shape = [4])
mvn

<tfp.distributions.TransformedDistribution 'scale_matvec_linear_operatorNormal' batch_shape=[2] event_shape=[4] dtype=float32>


# Display event and batch shape
print('batch shape: ', mvn.batch_shape)
print('event shape: ', mvn.event_shape)

batch shape:  (2,)
event shape:  (4,)


# Sample
y1 = mvn.sample(sample_shape = (n,))
y1.shape

TensorShape([10000, 2, 4])


# Define a MultivariateNormalLinearOperator distribution
mvn2 = tfd.MultivariateNormalLinearOperator(loc = 0, scale = scale_low_tri)
mvn2

<tfp.distributions.MultivariateNormalLinearOperator 'MultivariateNormalLinearOperator' batch_shape=[2] event_shape=[4] dtype=float32>


# Display event and batch shape
print('batch shape: ', mvn2.batch_shape)
print('event shape: ', mvn2.event_shape)

batch shape:  (2,)
event shape:  (4,)


# Sample
y2 = mvn2.sample(sample_shape = (n,))
y2.shape

TensorShape([10000, 2, 4])


# Check
xn = normal.sample((n, 2, 4))
tf.norm(mvn.log_prob(xn) - mvn2.log_prob(xn)) / tf.norm(mvn.log_prob(xn))

<tf.Tensor: shape=(), dtype=float32, numpy=0.74677587>


# Define a new bijector: Cubic
class Cubic(tfb.Bijector):

    def __init__(self, a, b, validate_args = False, name = 'Cubic'):
        self.a = tf.cast(a, tf.float32)
        self.b = tf.cast(b, tf.float32)
        if validate_args:
            assert tf.reduce_mean(tf.cast(tf.math.greater_equal(tf.abs(self.a), 1e-5), tf.float32)) == 1.0
            assert tf.reduce_mean(tf.cast(tf.math.greater_equal(tf.abs(self.b), 1e-5), tf.float32)) == 1.0
        super(Cubic, self).__init__ \
            (validate_args = validate_args, forward_min_event_ndims = 0, name = name)

    def _forward(self, x):
        x = tf.cast(x, tf.float32)
        return tf.squeeze(tf.pow(self.a * x + self.b, 3))

    def _inverse(self, y):
        y = tf.cast(y, tf.float32)
        return (tf.math.sign(y) * tf.pow(tf.abs(y), 1 / 3) - self.b) / self.a

    def _forward_log_det_jacobian(self, x):
        x = tf.cast(x, tf.float32)
        return tf.math.log(3. * tf.abs(self.a)) + 2. * tf.math.log(tf.abs(self.a * x + self.b))


# Cubic bijector
cubic = Cubic([1., -2.], [-1., 0.4], validate_args = True)


# Apply forward transformation
x = tf.constant([[1, 2], [3, 4]])
y = cubic.forward(x)
y

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[   0.     ,  -46.656  ],
       [   8.     , -438.97598]], dtype=float32)>


# Check inverse
np.linalg.norm(x - cubic.inverse(y))

0.0


# Plot the forward transformation
x = np.linspace(-10, 10, 500).reshape(-1, 1)
plt.plot(x, cubic.forward(x))

[<matplotlib.lines.Line2D at 0x7f1f30280210>,
 <matplotlib.lines.Line2D at 0x7f1f3020b810>]


# Display shape
cubic.forward(x).shape

TensorShape([500, 2])


# Plot the inverse
plt.plot(x, cubic.inverse(x))

[<matplotlib.lines.Line2D at 0x7f1f302e1b50>,
 <matplotlib.lines.Line2D at 0x7f1f3021c510>]


# Plot the forward log Jacobian determinant
plt.plot(x, cubic.forward_log_det_jacobian(x, event_ndims = 0))

[<matplotlib.lines.Line2D at 0x7f1f303d7710>,
 <matplotlib.lines.Line2D at 0x7f1f302f9210>]


# Plot the inverse log Jacobian determinant
plt.plot(x, cubic.inverse_log_det_jacobian(x, event_ndims = 0))

[<matplotlib.lines.Line2D at 0x7f1f30576dd0>,
 <matplotlib.lines.Line2D at 0x7f1f305cff50>]


# Create a transformed distribution with Cubic
normal = tfd.Normal(loc = 0., scale = 1.)
cubed_normal = tfd.TransformedDistribution(normal, cubic, event_shape = [2])


# Sample cubed_normal
n = 1000
g = cubed_normal.sample(n)
g.shape

TensorShape([1000, 2])


# Plot histograms
plt.figure(figsize = (12, 4))
plt.subplot(1, 2, 1)
plt.hist(g[..., 0].numpy(), bins = 50, density = True)
plt.subplot(1, 2, 2)
plt.hist(g[..., 1].numpy(), bins = 50, density = True)
plt.show()


# Make contour plot
xx = np.linspace(-0.5, 0.5, 100)
yy = np.linspace(-0.5, 0.5, 100)
X, Y = np.meshgrid(xx, yy)

fig, ax = plt.subplots(1, 1)
Z = cubed_normal.prob(np.dstack((X, Y)))
cp = ax.contourf(X, Y, Z)
fig.colorbar(cp) # Add a colorbar to a plot
ax.set_title('Filled Contours Plot')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()


# Create a transformed distribution with the inverse of Cube
inverse_cubic = tfb.Invert(cubic)
inv_cubed_normal = inverse_cubic(normal, event_shape = [2])


# Samble inv_cubed_normal
n = 1000
g = inv_cubed_normal.sample(n)
g.shape

TensorShape([1000, 2])


# Make contour plot
xx = np.linspace(-3.0, 3.0, 100)
yy = np.linspace(-2.0, 2.0, 100)
X, Y = np.meshgrid(xx, yy)

fig, ax = plt.subplots(1, 1)
Z = inv_cubed_normal.prob(np.dstack((X, Y)))
cp = ax.contourf(X, Y, Z)
fig.colorbar(cp) # Add a colorbar to a plot
ax.set_title('Filled Contours Plot')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()


# Plot histograms
plt.figure(figsize = (12, 4))
plt.subplot(1, 2, 1)
plt.hist(g[..., 0].numpy(), bins = 50, density = True)
plt.subplot(1, 2, 2)
plt.hist(g[..., 1].numpy(), bins = 50, density = True)
plt.show()


# Create a mixture of four Gaussians
probs = [0.45, 0.55]
mix_gauss = tfd.Mixture (
    cat = tfd.Categorical(probs = probs),
    components = [tfd.Normal(loc = 2.3, scale = 0.4), tfd.Normal(loc = 0.8, scale = 0.4)]
)


# Create the dataset
x_train = mix_gauss.sample(10000)
x_train = tf.data.Dataset.from_tensor_slices(x_train)
x_train = x_train.batch(128)

x_valid = mix_gauss.sample(1000)
x_valid = tf.data.Dataset.from_tensor_slices(x_valid)
x_valid = x_valid.batch(128)

print(x_train.element_spec)
print(x_valid.element_spec)

TensorSpec(shape=(None,), dtype=tf.float32, name=None)
TensorSpec(shape=(None,), dtype=tf.float32, name=None)


# Plot the data distribution
x = np.linspace(-5.0, 5.0, 100)
plt.plot(x, mix_gauss.prob(x))
plt.title('Data distribution')
plt.show()


# Make a trainable bijector
trainable_inv_cubic = tfb.Invert(Cubic(tf.Variable(0.25), tf.Variable(-0.1),))
trainable_inv_cubic.trainable_variables

(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.25>,
 <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.1>)


# Make a trainable transformed distribution
trainable_dist = tfd.TransformedDistribution(normal, trainable_inv_cubic)


# Plot the data and learned distributions
x = np.linspace(-5.0, 5.0, 100)
plt.figure(figsize = (12, 4))
plt.plot(x, mix_gauss.prob(x), label = 'data')
plt.plot(x, trainable_dist.prob(x), label = 'trainable')
plt.title('Data and trainable distribution')
plt.show()


# Train the bijector
num_epochs = 10
opt = tf.keras.optimizers.Adam()
train_losses = []
valid_losses = []

for epoch in range(num_epochs):
    print("Epoch {}...".format(epoch))
    train_loss = tf.keras.metrics.Mean()
    val_loss = tf.keras.metrics.Mean()
    for train_batch in x_train:
        with tf.GradientTape() as tape:
            tape.watch(trainable_inv_cubic.trainable_variables)
            loss = -trainable_dist.log_prob(train_batch)
        train_loss(loss)
        grads = tape.gradient(loss, trainable_inv_cubic.trainable_variables)
        opt.apply_gradients(zip(grads, trainable_inv_cubic.trainable_variables))
    train_losses.append(train_loss.result().numpy())
        
    # Validation
    for valid_batch in x_valid:
        loss = -trainable_dist.log_prob(valid_batch)
        val_loss(loss)
    valid_losses.append(val_loss.result().numpy())

Epoch 0...
Epoch 1...
Epoch 2...
Epoch 3...
Epoch 4...
Epoch 5...
Epoch 6...
Epoch 7...
Epoch 8...
Epoch 9...


# Plot the learning curves
plt.plot(train_losses, label = 'train')
plt.plot(valid_losses, label = 'valid')
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Negative log likelihood")
plt.title("Training and validation loss curves")
plt.show()


# Plot the data and learned distributions
x = np.linspace(-5.0, 5.0, 100)
plt.figure(figsize = (12, 4))
plt.plot(x, mix_gauss.prob(x), label = 'data')
plt.plot(x, trainable_dist.prob(x), label = 'learned')
plt.title('Data and learned distribution')
plt.show()


# Display trainable variables
trainable_inv_cubic.trainable_variables

(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.47675532>,
 <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.04277669>)


# Load dataset
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
n_samples = 1000
noisy_moons = datasets.make_moons(n_samples = n_samples, noise = .05)
X, y = noisy_moons
X_data = StandardScaler().fit_transform(X)
xlim, ylim = [-2, 2], [-2, 2]


# Plot with labels
y_label = y.astype(np.bool)
X_train, Y_train = X_data[..., 0], X_data[..., 1]
plt.scatter(X_train[y_label], Y_train[y_label], s = 10, color = 'blue')
plt.scatter(X_train[y_label == False], Y_train[y_label == False], s = 10, color = 'red')
plt.legend(['label: 1', 'label: 0'])
plt.xlim(xlim)
plt.ylim(ylim)

(-2.0, 2.0)


# Define base distribution
base_distribution = tfd.Normal(loc = 0, scale = 1)


# Define the trainable distribution
def make_masked_autoregressive_flow(hidden_units = [16, 16], activation = 'relu'):
    made = tfb.AutoregressiveNetwork \
        (params = 2, event_shape = [2], hidden_units = hidden_units, activation = activation)
    return tfb.MaskedAutoregressiveFlow(shift_and_log_scale_fn = made)

trainable_distribution = tfd.TransformedDistribution \
    (base_distribution, make_masked_autoregressive_flow(), event_shape = [2])


from mpl_toolkits.axes_grid1 import make_axes_locatable
from tensorflow.compat.v1 import logging
logging.set_verbosity(logging.ERROR)


# Define a plot contour routine
def plot_contour_prob(dist, rows = 1, title = [''], scale_fig = 4):
    cols = int(len(dist) / rows)
    xx = np.linspace(-5.0, 5.0, 100)
    yy = np.linspace(-5.0, 5.0, 100)
    X, Y = np.meshgrid(xx, yy)

    fig, ax = plt.subplots(rows, cols, figsize = (scale_fig * cols, scale_fig * rows))
    fig.tight_layout(pad = 4.5)

    i = 0
    for r in range(rows):
        for c in range(cols):
            Z = dist[i].prob(np.dstack((X, Y)))
            if len(dist) == 1:
                axi = ax
            elif rows == 1:
                axi = ax[c]
            else:
                axi = ax[r, c]

            # Plot contour
            p = axi.contourf(X, Y, Z)

            # Add a colorbar
            divider = make_axes_locatable(axi)
            cax = divider.append_axes("right", size = "5%", pad = 0.1)
            cbar = fig.colorbar(p, cax = cax)

            # Set title and labels
            axi.set_title('Filled Contours Plot: ' + str(title[i]))
            axi.set_xlabel('x')
            axi.set_ylabel('y')

            i += 1
    plt.show()


# Plot contour
activation = 'sigmoid'
maf = tfd.TransformedDistribution \
    (base_distribution, make_masked_autoregressive_flow(activation = activation), event_shape = [2])
plot_contour_prob([maf], scale_fig = 6, title = [activation])


from tensorflow.keras.layers import Input
from tensorflow.keras import Model


# Make samples
x = base_distribution.sample((1000, 2))
names = [base_distribution.name, trainable_distribution.bijector.name]
samples = [x, trainable_distribution.bijector.forward(x)]


# Define a scatter plot routine for the bijectors
def _plot(results, rows=1, legend = False):
    cols = int(len(results) / rows)
    f, arr = plt.subplots(rows, cols, figsize = (4 * cols, 4 * rows))
    i = 0
    for r in range(rows):
        for c in range(cols):
            res = results[i]
            X, Y = res[..., 0].numpy(), res[..., 1].numpy()
            if rows == 1:
                p = arr[c]
            else:
                p = arr[r, c]
            p.scatter(X, Y, s = 10, color = 'red')
            p.set_xlim([-5, 5])
            p.set_ylim([-5, 5])
            p.set_title(names[i])
            
            i += 1


# Plot
_plot(samples)


from tensorflow.keras.callbacks import LambdaCallback


# Define a training routine
def train_dist_routine(trainable_distribution, n_epochs = 200, batch_size = None, n_disp = 100):
    x_ = Input(shape = (2,), dtype = tf.float32)
    log_prob_ = trainable_distribution.log_prob(x_)
    model = Model(x_, log_prob_)
    model.compile(optimizer = tf.optimizers.Adam(), loss = lambda _, log_prob: -log_prob)

    ns = X_data.shape[0]
    if batch_size is None:
        batch_size = ns

    # Display the loss every n_disp epoch
    epoch_callback = LambdaCallback (
        on_epoch_end = lambda epoch, logs:
            print('\n Epoch {}/{}'.format(epoch + 1, n_epochs, logs),
                  '\n\t ' + (': {:.4f}, '.join(logs.keys()) + ': {:.4f}').format(*logs.values()))
            if epoch % n_disp == 0 else False 
    )

    history = model.fit (
        x = X_data, y = np.zeros((ns, 0), dtype=np.float32),
        batch_size = batch_size, epochs = n_epochs,
        validation_split = 0.2, shuffle = True, verbose = False,
        callbacks = [epoch_callback]
    )
    return history


# Train the distribution
history = train_dist_routine(trainable_distribution, n_epochs = 600, n_disp = 50)

 Epoch 1/600 
	 loss: 2.9460, val_loss: 2.8624

 Epoch 51/600 
	 loss: 2.7376, val_loss: 2.6847

 Epoch 101/600 
	 loss: 2.6527, val_loss: 2.6195

 Epoch 151/600 
	 loss: 2.6311, val_loss: 2.6019

 Epoch 201/600 
	 loss: 2.6045, val_loss: 2.5796

 Epoch 251/600 
	 loss: 2.5639, val_loss: 2.5437

 Epoch 301/600 
	 loss: 2.4945, val_loss: 2.4838

 Epoch 351/600 
	 loss: 2.3925, val_loss: 2.3987

 Epoch 401/600 
	 loss: 2.2868, val_loss: 2.3155

 Epoch 451/600 
	 loss: 2.2146, val_loss: 2.2542

 Epoch 501/600 
	 loss: 2.1721, val_loss: 2.2096

 Epoch 551/600 
	 loss: 2.1409, val_loss: 2.1822


# Get losses
train_losses = history.history['loss']
valid_losses = history.history['val_loss']


# Plot loss vs epoch
plt.plot(train_losses, label = 'train')
plt.plot(valid_losses, label = 'valid')
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Negative log likelihood")
plt.title("Training and validation loss curves")
plt.show()


# Make samples
x = base_distribution.sample((1000, 2))
names = [base_distribution.name, trainable_distribution.bijector.name]
samples = [x, trainable_distribution.bijector.forward(x)]


# Plot
_plot(samples)


# Define a plot routine
def visualize_training_data(samples):
    f, arr = plt.subplots(1, 2, figsize = (15, 6))
    names = ['Data', 'Trainable']
    samples = [tf.constant(X_data), samples[-1]]

    for i in range(2):
        res = samples[i]
        X, Y = res[..., 0].numpy(), res[..., 1].numpy()
        arr[i].scatter(X, Y, s = 10, color = 'red')
        arr[i].set_xlim([-2, 2])
        arr[i].set_ylim([-2, 2])
        arr[i].set_title(names[i])

visualize_training_data(samples)


# Plot contour
plot_contour_prob([trainable_distribution], scale_fig = 6)


# Define a more expressive model
num_bijectors = 6
bijectors = []

for i in range(num_bijectors):
    masked_auto_i = make_masked_autoregressive_flow(hidden_units = [256, 256], activation = 'relu')
    bijectors.append(masked_auto_i)
    bijectors.append(tfb.Permute(permutation = [1, 0]))

flow_bijector = tfb.Chain(list(reversed(bijectors[:-1])))


# Define the trainable distribution
trainable_distribution = tfd.TransformedDistribution \
    (distribution = base_distribution, bijector = flow_bijector, event_shape = [2])


# Make samples
def make_samples():
    x = base_distribution.sample((1000, 2))
    samples = [x]
    names = [base_distribution.name]
    for bijector in reversed(trainable_distribution.bijector.bijectors):
        x = bijector.forward(x)
        samples.append(x)
        names.append(bijector.name)
    return names, samples

names, samples = make_samples()


# Plot
_plot(samples, 3)


# Plot
visualize_training_data(samples)


# Train the distribution
history = train_dist_routine(trainable_distribution, n_epochs = 600, n_disp = 50)

 Epoch 1/600 
	 loss: 2.9169, val_loss: 2.6941

 Epoch 51/600 
	 loss: 2.1496, val_loss: 2.1582

 Epoch 101/600 
	 loss: 1.8858, val_loss: 1.9126

 Epoch 151/600 
	 loss: 2.0485, val_loss: 2.0945

 Epoch 201/600 
	 loss: 1.9392, val_loss: 1.8634

 Epoch 251/600 
	 loss: 1.3229, val_loss: 1.4462

 Epoch 301/600 
	 loss: 1.2615, val_loss: 1.3872

 Epoch 351/600 
	 loss: 1.1725, val_loss: 1.3308

 Epoch 401/600 
	 loss: 1.1442, val_loss: 1.3434

 Epoch 451/600 
	 loss: 1.1349, val_loss: 1.3283

 Epoch 501/600 
	 loss: 1.1160, val_loss: 1.3346

 Epoch 551/600 
	 loss: 1.1374, val_loss: 1.3436


# Get losses
train_losses = history.history['loss']
valid_losses = history.history['val_loss']


# Plot loss vs epoch
plt.plot(train_losses, label = 'train')
plt.plot(valid_losses, label = 'valid')
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Negative log likelihood")
plt.title("Training and validation loss curves")
plt.show()


# Make samples and plot
names, samples = make_samples()
_plot(samples, 3)


# Plot
visualize_training_data(samples)


# Plot
plot_contour_prob([trainable_distribution], scale_fig = 6)


import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
print("TF version:", tf.__version__)
print("TFP version:", tfp.__version__)

# Additional packages for the reading
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Ellipse

TF version: 2.3.0
TFP version: 0.11.0


_, axs = plt.subplots(1, 2, sharex = True, sharey = True, figsize = (11, 5))

delta = 45.0  # degrees

q_ell_inf = Ellipse (
    (0, 0), 2, 1.5, 45, ec = 'blue', fc = 'none',
    alpha = 0.5, label = 'q(x)', hatch = '/'
)
q_ell_fin = Ellipse (
    (0, 0), 0.5, 0.75, 45, ec = 'blue', fc = 'none',
    alpha = 0.5, label = 'q(x)', hatch = '/'
)
p_ell_inf = Ellipse (
    (0, 0), 1, 1, 45, ec = 'red', fc = 'none',
    alpha = 0.5, label = 'p(x)', hatch = '\\'
)
p_ell_fin = Ellipse (
    (0, 0), 1, 1, 45, ec = 'red', fc = 'none',
    alpha = 0.5, label = 'p(x)', hatch = '\\'
)

# KL divergence is infinite
for ell in [q_ell_inf, p_ell_inf]:
    axs[0].add_artist(ell)
axs[0].legend([q_ell_inf, p_ell_inf], ['Support of q', 'Support of p'], loc = 'lower right')
axs[0].get_xaxis().set_ticks([])
axs[0].get_yaxis().set_ticks([])
    
# KL divergence is finite
for ell in [q_ell_fin, p_ell_fin]:
    axs[1].add_artist(ell)
axs[1].legend([q_ell_fin, p_ell_fin], ['Support of q', 'Support of p'], loc = 'lower right')
axs[1].get_xaxis().set_ticks([])
axs[1].get_yaxis().set_ticks([])

axs[0].set_title(r'$D_{KL}[q \ || \ p] = +\infty$')
axs[1].set_title(r'$D_{KL}[q \ || \ p]$ is finite but non-zero')
plt.xlim(-1, 1)
plt.ylim(-1, 1)

(-1.0, 1.0)


# Simple example
mu_q = 0.
sigma_q = 1.
mu_p = 0.
sigma_p = 0.5
distribution_q = tfd.Normal(loc = mu_q, scale = sigma_q)
distribution_p = tfd.Normal(loc = mu_p, scale = sigma_p)

tfd.kl_divergence(distribution_q, distribution_p) # D_{KL}[q || p]

<tf.Tensor: shape=(), dtype=float32, numpy=0.8068528>


# Analytical expression for KL divergence between two univariate Normals
0.5 * ((sigma_q / sigma_p) ** 2 + ((mu_q - mu_p) / sigma_p) ** 2 + 2 * np.log(sigma_p / sigma_q) - 1)

0.8068528194400546


# Batch example with broadcasting
distributions_q = tfd.Normal(loc = [0., 1.], scale = 1.)
distribution_p = tfd.Normal(loc = 0., scale = 0.5)


# Notice the batch_shape
distributions_q

<tfp.distributions.Normal 'Normal' batch_shape=[2] event_shape=[] dtype=float32>


# [D_{KL}[q_1 || p], D_{KL}[q_2 || p]
tfd.kl_divergence(distributions_q, distribution_p)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.8068528, 2.8068528], dtype=float32)>


# An example with another distribution
beta_q = tfd.Beta(concentration1 = 12, concentration0 = 3)
beta_p = tfd.Beta(concentration1 = 9, concentration0 = 3)
tfd.kl_divergence(beta_q, beta_p)

<tf.Tensor: shape=(), dtype=float32, numpy=0.09615421>


# An example with a multivariate distribution
cov_q = np.array([[1., 0.5], [0.5, 1.]])
cov_p = np.array([[1., 0.], [0., 1.]])
mvtnormal_q = tfd.MultivariateNormalTriL(loc = [0., 0.], scale_tril = tf.linalg.cholesky(cov_q))
mvtnormal_p = tfd.MultivariateNormalTriL(loc = [0., 0.], scale_tril = tf.linalg.cholesky(cov_p))
tfd.kl_divergence(mvtnormal_q, mvtnormal_p)

<tf.Tensor: shape=(), dtype=float64, numpy=0.14384103622589053>


# uniform_q and beta_p are both uniform distributions with support [0, 1]
uniform_q = tfd.Uniform(low = 0., high = 1.)
beta_p = tfd.Beta(concentration1 = 0., concentration0 = 0.)


# kl_divergence has no method for computing their divergence
try:
    tfd.kl_divergence(uniform_q, beta_p)
except Exception as e:
    print(e)

No KL(distribution_a || distribution_b) registered for distribution_a type Uniform and distribution_b type Beta


# Evaluate the exact KL divergence

distribution_q = tfd.Normal(loc=0., scale=1.)
distribution_p = tfd.Normal(loc=0., scale=0.5)

exact_kl_divergence = tfd.kl_divergence(distribution_q, distribution_p).numpy() # D_{KL}[q || p]
exact_kl_divergence

0.8068528


# Function to estimate the KL divergence with Monte Carlo samples
def monte_carlo_estimate_of_kl_divergence(n, q_sampler, q_density, p_density):
    '''
        Computes a Monte Carlo estimate of D_{KL}[q || p] using
        n samples from q_sampler.
        
        q_sampler is a function that receives a positive integer
        and returns as many samples from q.
        
        Given samples x_1, ..., x_n from q_sampler, the Monte Carlo
        estimate is
            
            \frac{1}{n}\sum_{i=1}^n \log(q(x_i)) - \log(p(x_i))
            
        where q and p are density/mass functions. 
    '''
    x = q_sampler(n)
    KL_estimate = np.mean(np.log(q_density(x)) - np.log(p_density(x)))
    return(KL_estimate)


# Single MC estimate
n = 1000 # number of samples used in MC estimate
q_sampler = distribution_q.sample
q_density = distribution_q.prob
p_density = distribution_p.prob

monte_carlo_estimate_of_kl_divergence(n, q_sampler, q_density, p_density)

0.8441007


# Create a grid of 8 points
n_grid = 10 ** np.arange(1, 8)
samples_per_grid_point = 100  # Number of MC estimates to make for each value of n


# Array to store results
kl_estimates = np.zeros(shape = [samples_per_grid_point, len(n_grid), 2])


# Make 100 MC estimates for each value of n, store the results in kl_estimates
for sample_num in range(samples_per_grid_point):
    for grid_num, n in enumerate(n_grid):
        kl_estimates[sample_num, grid_num, 0] = n
        kl_estimates[sample_num, grid_num, 1] = \
            monte_carlo_estimate_of_kl_divergence(n, q_sampler, q_density, p_density)


# Compute RMSE of estimates (this is approximately equal to the standard deviation of the MC estimator)
rmse_of_kl_estimates = np.sqrt \
    (np.mean((kl_estimates[:, :, 1] - exact_kl_divergence) ** 2, axis = 0))


# Compute absolute error of the MC estimates
abs_error_of_kl_estimates = abs(kl_estimates[:, :, 1].flatten() - exact_kl_divergence)


# Plot the results
_, ax = plt.subplots(1, 1, figsize = (15, 5))
plt.xlabel(r'Number of samples in Monte Carlo estimate, $n$')
ax.scatter (
    kl_estimates[:, :, 0], 
    abs_error_of_kl_estimates,
    marker = '.', color = 'red',
    alpha = 0.1, label = 'Absolute error of Monte Carlo estimates'
)
ax.plot(n_grid, rmse_of_kl_estimates, color = 'k', label = 'RMSE of Monte Carlo estimates')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_ylim([1e-6, 10])
ax.legend()

<matplotlib.legend.Legend at 0x7fed1c260d10>


import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors
print("TF version:", tf.__version__)
print("TFP version:", tfp.__version__)

# Additional packages for this reading
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

TF version: 2.3.0
TFP version: 0.11.0


# Define the target distribution
tf.random.set_seed(41)

p_mu = [0., 0.]
p_Sigma = tfp.bijectors.Chain \
    ([tfb.TransformDiagonal(tfb.Softplus()), tfb.FillTriangular()])(tf.random.uniform([3]))
p = tfd.MultivariateNormalTriL(loc = p_mu, scale_tril = p_Sigma)


# Define the approximating distribution
scale_tril_init = tfb.FillScaleTriL()(tf.random.normal([3]))
q = tfd.MultivariateNormalTriL (
    loc = tf.Variable(tf.random.normal([2])),
    scale_tril = tfp.util.TransformedVariable(scale_tril_init, bijector = tfb.FillScaleTriL())
)


@tf.function
def loss_and_grads(dist_a, dist_b):
    '''
        Returns D_{KL}[dist_a || dist_b] and the gradients of this 
        with respect to the trainable Variables of dist_a.
    '''
    with tf.GradientTape() as tape:
        loss = tfd.kl_divergence(dist_a, dist_b)
    return loss, tape.gradient(loss, dist_a.trainable_variables)


# Define function for graphics

def plot_density_contours(density, X1, X2, contour_kwargs, ax = None):
    '''
        Plots the contours of a bivariate TensorFlow density function (i.e. .prob()).
        X1 and X2 are numpy arrays of mesh coordinates.
    '''
    X = np.hstack([X1.flatten()[:, np.newaxis], X2.flatten()[:, np.newaxis]])
    density_values = np.reshape(density(X).numpy(), newshape = X1.shape)
    
    if ax == None:
        _, ax = plt.subplots(figsize = (7, 7))
    
    ax.contour(X1, X2, density_values, **contour_kwargs)
    return(ax)

x1 = np.linspace(-5, 5, 1000)
x2 = np.linspace(-5, 5, 1000)
X1, X2 = np.meshgrid(x1, x2)
contour_levels = np.linspace(1e-4, 10 ** (-0.8), 20)


# Set up and run a custom training loop to minimise the KL loss

num_train_steps = 1000
opt = tf.keras.optimizers.Adam(learning_rate = .01)
for i in range(num_train_steps):
    
    # Compute the KL divergence and its gradients
    q_loss, grads = loss_and_grads(q, p)
    
    # Update the trainable variables using the gradients via the optimizer
    opt.apply_gradients(zip(grads, q.trainable_variables))
    
    # Plot the updated density 
    if ((i + 1) % 10 == 0):
        clear_output(wait = True)
        ax = plot_density_contours \
            (p.prob, X1, X2, {'levels': contour_levels, 'cmap': 'cividis', 'alpha': 0.5})
        ax = plot_density_contours \
            (q.prob, X1, X2, {'levels': contour_levels, 'cmap': 'plasma'}, ax = ax)
        ax.set_title (
            'Density contours of $p$ and $q$\n' +
                'Iteration ' + str(i + 1) + '\n' +
                '$D_{KL}[q \ || \ p] = ' + 
                str(np.round(q_loss.numpy(), 4)) + '$',
            loc = 'left'
        )
        plt.pause(.01)


import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfpl = tfp.layers
tfb = tfp.bijectors
print("TF version:", tf.__version__)
print("TFP version:", tfp.__version__)

TF version: 2.3.0
TFP version: 0.11.0


from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Reshape
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load Fashion MNIST
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
class_names = np.array ([
    'T-shirt/top', 'Trouser/pants', 'Pullover shirt', 'Dress',
    'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag','Ankle boot'
])


# Display a few examples
n_examples = 1000
example_images = x_test[0:n_examples]
example_labels = y_test[0:n_examples]

f, axs = plt.subplots(1, 5, figsize = (15, 4))
for j in range(len(axs)):
    axs[j].imshow(example_images[j], cmap = 'binary')
    axs[j].axis('off')


# Define the encoder
encoded_dim = 2
encoder = Sequential ([
    Flatten(input_shape = (28, 28)),
    Dense(256, activation = 'sigmoid'),
    Dense(64, activation = 'sigmoid'),
    Dense(encoded_dim)
])


# Encode examples before training
pretrain_example_encodings = encoder(example_images).numpy()


# Plot encoded examples before training 
f, ax = plt.subplots(1, 1, figsize = (7, 7))
sns.scatterplot (
    pretrain_example_encodings[:, 0],
    pretrain_example_encodings[:, 1],
    hue = class_names[example_labels], ax = ax,
    palette = sns.color_palette("colorblind", 10)
)
ax.set_xlabel('Encoding dimension 1')
ax.set_ylabel('Encoding dimension 2')
ax.set_title('Encodings of example images before training')

/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

Text(0.5, 1.0, 'Encodings of example images before training')


# Define the decoder
decoder = Sequential ([
    Dense(64, activation = 'sigmoid', input_shape = (encoded_dim,)),
    Dense(256, activation = 'sigmoid'),
    Dense(28 * 28, activation = 'sigmoid'),
    Reshape((28, 28))
])


# Compile and fit the model
autoencoder = Model(inputs = encoder.inputs, outputs = decoder(encoder.outputs))

# Specify loss - input and output is in [0., 1.], so we can use a binary cross-entropy loss
autoencoder.compile(loss = 'binary_crossentropy')

# Fit model - highlight that labels and input are the same
autoencoder.fit(x = x_train, y = x_train, epochs = 10, batch_size = 32)

Epoch 1/10
1875/1875 [==============================] - 8s 4ms/step - loss: 0.3780
Epoch 2/10
1875/1875 [==============================] - 8s 4ms/step - loss: 0.3425
Epoch 3/10
1875/1875 [==============================] - 8s 4ms/step - loss: 0.3350
Epoch 4/10
1875/1875 [==============================] - 8s 5ms/step - loss: 0.3305
Epoch 5/10
1875/1875 [==============================] - 9s 5ms/step - loss: 0.3276
Epoch 6/10
1875/1875 [==============================] - 8s 5ms/step - loss: 0.3258
Epoch 7/10
1875/1875 [==============================] - 8s 4ms/step - loss: 0.3243
Epoch 8/10
1875/1875 [==============================] - 8s 5ms/step - loss: 0.3233
Epoch 9/10
1875/1875 [==============================] - 8s 4ms/step - loss: 0.3223
Epoch 10/10
1875/1875 [==============================] - 8s 5ms/step - loss: 0.3215

<tensorflow.python.keras.callbacks.History at 0x7f867408a910>


# Compute example encodings after training
posttrain_example_encodings = encoder(example_images).numpy()


# Compare the example encodings before and after training
f, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 7))
sns.scatterplot (
    pretrain_example_encodings[:, 0],
    pretrain_example_encodings[:, 1],
    hue = class_names[example_labels], ax = axs[0],
    palette = sns.color_palette("colorblind", 10)
)
sns.scatterplot (
    posttrain_example_encodings[:, 0],
    posttrain_example_encodings[:, 1],
    hue = class_names[example_labels], ax = axs[1],
    palette = sns.color_palette("colorblind", 10)
)

axs[0].set_title('Encodings of example images before training')
axs[1].set_title('Encodings of example images after training')

for ax in axs: 
    ax.set_xlabel('Encoding dimension 1')
    ax.set_ylabel('Encoding dimension 2')
    ax.legend(loc = 'lower right')

/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning


# Compute the autoencoder's reconstructions
reconstructed_example_images = autoencoder(example_images)


# Evaluate the autoencoder's reconstructions
f, axs = plt.subplots(2, 5, figsize = (15, 4))
for j in range(5):
    axs[0, j].imshow(example_images[j], cmap = 'binary')
    axs[1, j].imshow(reconstructed_example_images[j].numpy().squeeze(), cmap = 'binary')
    axs[0, j].axis('off')
    axs[1, j].axis('off')


import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output


# Define a target distribution, p
tf.random.set_seed(41)
p_mu = [0., 0.]
p_L = tfb.Chain([tfb.TransformDiagonal(tfb.Softplus()), tfb.FillTriangular()]) \
    (tf.random.uniform([3]))
p = tfd.MultivariateNormalTriL(loc = p_mu, scale_tril = p_L)


# Plot the target distribution's density contours
def plot_density_contours(density, X1, X2, contour_kwargs, ax = None):
    '''
        Plots the contours of a bivariate TensorFlow density function (i.e. .prob()).
        X1 and X2 are numpy arrays of mesh coordinates.
    '''
    X = np.hstack([X1.flatten()[:, np.newaxis], X2.flatten()[:, np.newaxis]])
    density_values = np.reshape(density(X).numpy(), newshape = X1.shape)
    
    if ax == None:
        _, ax = plt.subplots(figsize = (7, 7))
    
    ax.contour(X1, X2, density_values, **contour_kwargs)
    return(ax)

x1 = np.linspace(-5, 5, 1000)
x2 = np.linspace(-5, 5, 1000)
X1, X2 = np.meshgrid(x1, x2)
f, ax = plt.subplots(1, 1, figsize = (7, 7))

# Density contours are linearly spaced
contour_levels = np.linspace(1e-4, 10 ** -0.8, 20) # specific to this seed
ax = plot_density_contours \
    (p.prob, X1, X2, {'levels': contour_levels,  'cmap': 'cividis'}, ax = ax)
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.set_title('Density contours of target distribution, $p$')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')

Text(0, 0.5, '$x_2$')


# Initialize an approximating distribution, q, that has diagonal covariance
tf.random.set_seed(41)
q = tfd.MultivariateNormalDiag (
    loc = tf.Variable(tf.random.normal([2])),
    scale_diag = tfp.util.TransformedVariable(tf.random.uniform([2]), bijector = tfb.Exp())
)


# Define a function for the Kullback-Leibler divergence
@tf.function
def loss_and_grads(dist_a, dist_b, reverse = False):
    with tf.GradientTape() as tape:
        if not reverse:
            loss = tfd.kl_divergence(dist_a, dist_b)
        else:
            loss = tfd.kl_divergence(dist_b, dist_a)
    return loss, tape.gradient(loss, dist_a.trainable_variables)


# Run a training loop that computes KL[q || p], updates q's parameters using its gradients
num_train_steps = 250
opt = tf.keras.optimizers.Adam(learning_rate = .01)

for i in range(num_train_steps):
    
    # Compute the KL divergence and its gradients
    q_loss, grads = loss_and_grads(q, p)
    
    # Update the trainable variables using the gradients via the optimizer
    opt.apply_gradients(zip(grads, q.trainable_variables))
    
    # Plot the updated density 
    if ((i + 1) % 10 == 0):
        clear_output(wait = True)
        ax = plot_density_contours \
            (p.prob, X1, X2, {'levels': contour_levels, 'cmap': 'cividis', 'alpha': 0.5})
        ax = plot_density_contours \
            (q.prob, X1, X2, {'levels': contour_levels, 'cmap': 'plasma'}, ax = ax)
        ax.set_title (
            'Density contours of $p$ and $q$\n' +
                'Iteration ' + str(i + 1) + '\n' +
                '$D_{KL}[q \ || \ p] = ' + 
                str(np.round(q_loss.numpy(), 4)) + '$'
            ,
            loc = 'left'
        )
        plt.pause(.01)


# Re-fit the distribution, this time fitting q_rev by minimising KL[p || q_rev]
tf.random.set_seed(41)
q_rev = tfd.MultivariateNormalDiag (
    loc = tf.Variable(tf.random.normal([2])),
    scale_diag = tfp.util.TransformedVariable(tf.random.uniform([2]), bijector = tfb.Exp())
)


# Edit loss_and_grads function


# Re-initialize optimizer, run training loop
opt = tf.keras.optimizers.Adam(learning_rate = .01)
for i in range(num_train_steps):
    # Reverse the KL divergence terms - compute KL[p || q_rev] 
    q_rev_loss, grads = loss_and_grads(q_rev, p, reverse = True)
    
    # Update the trainable variables using the gradients via the optimizer
    opt.apply_gradients(zip(grads, q_rev.trainable_variables))
    
    # Plot the updated density 
    if ((i + 1) % 10 == 0):
        clear_output(wait=True)
        ax = plot_density_contours \
            (p.prob, X1, X2, {'levels': contour_levels, 'cmap': 'cividis', 'alpha': 0.5})
        ax = plot_density_contours \
            (q_rev.prob, X1, X2, {'levels': contour_levels, 'cmap': 'plasma'}, ax = ax)
        ax.set_title (
            'Density contours of $p$ and $q_{rev}$\n' +
                'Iteration ' + str(i + 1) + '\n' +
                '$D_{KL}[p \ || \ q_{rev}] = ' +
                str(np.round(q_rev_loss.numpy(), 4)) + '$'
            ,
            loc = 'left'
        )
        plt.pause(.01)


# Plot q and q_rev alongside one another
f, axs = plt.subplots(1, 2, figsize = (15, 7))

axs[0] = plot_density_contours \
    (p.prob, X1, X2, {'levels': contour_levels, 'cmap': 'cividis', 'alpha': 0.5}, ax = axs[0])
axs[0] = plot_density_contours \
    (q.prob, X1, X2, {'levels': contour_levels, 'cmap': 'plasma'}, ax = axs[0])
axs[0].set_title (
    'Density contours of $p$ and $q$\n' +
        '$D_{KL}[q \ || \ p] = ' + str(np.round(q_loss.numpy(), 4)) + '$'
    ,
    loc = 'left'
)

axs[1] = plot_density_contours \
    (p.prob, X1, X2, {'levels': contour_levels, 'cmap': 'cividis', 'alpha': 0.5}, ax = axs[1])
axs[1] = plot_density_contours \
    (q_rev.prob, X1, X2, {'levels': contour_levels, 'cmap': 'plasma'}, ax = axs[1])
axs[1].set_title (
    'Density contours of $p$ and $q_{rev}$\n' +
        '$D_{KL}[p \ || \ q_{rev}] = ' + str(np.round(q_rev_loss.numpy(), 4)) + '$'
    ,
    loc = 'left'
)

Text(0.0, 1.0, 'Density contours of $p$ and $q_{rev}$\n$D_{KL}[p \\ || \\ q_{rev}] = 0.1788$')


from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Reshape
import matplotlib.pyplot as plt
import numpy as np


# Import Fasion MNIST, make it a TensorFlow Dataset
(x_train, _), (x_test, _) = tf.keras.datasets.fashion_mnist.load_data()
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
example_x = x_test[:16]

batch_size = 64
x_train = tf.data.Dataset.from_tensor_slices(x_train).batch(batch_size)


# Define the encoding distribution, q(z|x)
latent_size = 2
event_shape = (28, 28)
encoder = Sequential ([
    Flatten(input_shape = event_shape),
    Dense(256, activation = 'relu'),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(2 * latent_size),
    tfpl.DistributionLambda (
        lambda t: tfd.MultivariateNormalDiag \
            (loc = t[..., :latent_size], scale_diag = tf.math.exp(t[..., latent_size:]))
    )
])

WARNING:tensorflow:From /home/bacti/anaconda3/envs/tensor/lib/python3.7/site-packages/tensorflow/python/ops/linalg/linear_operator_diag.py:166: calling LinearOperator.__init__ (from tensorflow.python.ops.linalg.linear_operator) with graph_parents is deprecated and will be removed in a future version.
Instructions for updating:
Do not pass `graph_parents`.  They will  no longer be used.


# Pass an example image through the network - should return a batch of MultivariateNormalDiags.
encoder(example_x)

<tfp.distributions.MultivariateNormalDiag 'sequential_2_distribution_lambda_MultivariateNormalDiag' batch_shape=[16] event_shape=[2] dtype=float32>


# Define the decoding distribution, p(x|z)
decoder = Sequential ([
    Dense(32, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(128, activation = 'relu'),
    Dense(256, activation = 'relu'),
    Dense(tfpl.IndependentBernoulli.params_size(event_shape)),
    tfpl.IndependentBernoulli(event_shape)
])


# Pass a batch of examples to the decoder
decoder(tf.random.normal([16, latent_size]))

<tfp.distributions.Independent 'sequential_3_independent_bernoulli_IndependentBernoulli_Independentsequential_3_independent_bernoulli_IndependentBernoulli_Bernoulli' batch_shape=[16] event_shape=[28, 28] dtype=float32>


# Define the prior, p(z) - a standard bivariate Gaussian
prior = tfd.MultivariateNormalDiag(loc = tf.zeros(latent_size))


# Specify the loss function, an estimate of the -ELBO
def loss(x, encoding_dist, sampled_decoding_dist, prior):
    return tf.reduce_sum(tfd.kl_divergence(encoding_dist, prior) - sampled_decoding_dist.log_prob(x))


# Define a function that returns the loss and its gradients
@tf.function
def get_loss_and_grads(x):
    with tf.GradientTape() as tape:
        encoding_dist = encoder(x)
        sampled_z = encoding_dist.sample()
        sampled_decoding_dist = decoder(sampled_z)
        current_loss = loss(x, encoding_dist, sampled_decoding_dist, prior)
    grads = tape.gradient(current_loss, encoder.trainable_variables + decoder.trainable_variables)
    return current_loss, grads


# Compile and train the model
num_epochs = 5
opt = tf.keras.optimizers.Adam()
for i in range(num_epochs):
    for train_batch in x_train:
        current_loss, grads = get_loss_and_grads(train_batch)
        opt.apply_gradients(zip(grads, encoder.trainable_variables + decoder.trainable_variables))
        
    print('-ELBO after epoch {}: {:.0f}'.format(i + 1, current_loss.numpy()))

-ELBO after epoch 1: 9014
-ELBO after epoch 2: 8835
-ELBO after epoch 3: 8832
-ELBO after epoch 4: 8787
-ELBO after epoch 5: 8762


# Connect encoder and decoder, compute a reconstruction
def vae(inputs):
    approx_posterior = encoder(inputs)
    decoding_dist = decoder(approx_posterior.sample())
    return decoding_dist.mean()

example_reconstruction = vae(example_x).numpy().squeeze()


# Plot examples against reconstructions
f, axs = plt.subplots(2, 6, figsize = (16, 5))
for j in range(6):
    axs[0, j].imshow(example_x[j, :, :].squeeze(), cmap = 'binary')
    axs[1, j].imshow(example_reconstruction[j, :, :], cmap = 'binary')
    axs[0, j].axis('off')
    axs[1, j].axis('off')


# Generate an example - sample a z value, then sample a reconstruction from p(x|z)
z = prior.sample(6)
generated_x = decoder(z).mean()


# Display generated_x
f, axs = plt.subplots(1, 6, figsize = (16, 5))
for j in range(6):
    axs[j].imshow(generated_x[j, :, :].numpy().squeeze(), cmap = 'binary')
    axs[j].axis('off')


# -ELBO estimate using an estimate of the KL divergence
def loss(x, encoding_dist, sampled_decoding_dist, prior, sampled_z):
    recon_loss = -sampled_decoding_dist.log_prob(x)
    kl_approx = (encoding_dist.log_prob(sampled_z) - prior.log_prob(sampled_z))
    return tf.reduce_sum(kl_approx + recon_loss)


from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Reshape
import matplotlib.pyplot as plt
import numpy as np


# Import Fashion MNIST
(x_train, _), (x_test, _) = tf.keras.datasets.fashion_mnist.load_data()
x_train = x_train.astype('float32') / 256. + 0.5 / 256
x_test = x_test.astype('float32') / 256. + 0.5 / 256
example_x = x_test[:16]

batch_size = 32
x_train = tf.data.Dataset.from_tensor_slices((x_train, x_train)).batch(batch_size)
x_test = tf.data.Dataset.from_tensor_slices((x_test, x_test)).batch(batch_size)


# Define latent_size and the prior, p(z)
latent_size = 4
prior = tfd.MultivariateNormalDiag(loc = tf.zeros(latent_size))


# Define the encoding distribution using a tfpl.KLDivergenceAddLoss layer
event_shape = (28, 28)
encoder = Sequential ([
    Flatten(input_shape = event_shape),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(tfpl.MultivariateNormalTriL.params_size(latent_size)),
    tfpl.MultivariateNormalTriL(latent_size),
    tfpl.KLDivergenceAddLoss(prior) # estimates KL[ q[z|x] || p(z) ]
])

# samples z_j from q(z|x_j)
# then computes log q(z_j|x_j) - log p(z_j)


# See how `KLDivergenceAddLoss` affects `encoder.losses`
# encoder.losses before the network has received any inputs
encoder.losses

[<tf.Tensor 'kl_divergence_add_loss/kldivergence_loss/batch_total_kl_divergence:0' shape=() dtype=float32>]


# Pass a batch of images through the encoder
encoder(example_x)

<tfp.distributions.MultivariateNormalTriL 'sequential_4_multivariate_normal_tri_l_MultivariateNormalTriL_MultivariateNormalTriL' batch_shape=[16] event_shape=[4] dtype=float32>


# See how encoder.losses has changed
encoder.losses

[<tf.Tensor: shape=(), dtype=float32, numpy=0.35394907>]


# Re-specify the encoder using `weight` and `test_points_fn`
encoder = Sequential ([
    Flatten(input_shape = event_shape),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(tfpl.MultivariateNormalTriL.params_size(latent_size)),
    tfpl.MultivariateNormalTriL(latent_size),
    tfpl.KLDivergenceAddLoss (
        prior, use_exact_kl = False, weight = 1.5,
        test_points_fn = lambda q: q.sample(10),
        test_points_reduce_axis = 0
    ) # estimates KL[ q[z|x] || p(z) ]
])

# (n_samples, batch_size, dim_z)
# z_{ij} is the ith sample for x_j (is at (i,j,:) in tensor of samples)
# is mapped to log q(z_{ij}|x_j) - log p(z_{ij})
# => tensor of KL divergences has shape (n_samples, batch_size)


# Replacing `KLDivergenceAddLoss`  with `KLDivergenceRegularizer` in the previous (probabilistic) layer
divergence_regularizer = tfpl.KLDivergenceRegularizer (
    prior, use_exact_kl = False,
    test_points_fn = lambda q: q.sample(5),
    test_points_reduce_axis = 0
)

encoder = Sequential ([
    Flatten(input_shape = event_shape),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(tfpl.MultivariateNormalTriL.params_size(latent_size)),
    tfpl.MultivariateNormalTriL(latent_size, activity_regularizer = divergence_regularizer)
])


# Specify the decoder, p(x|z)
decoder = Sequential ([
    Dense(16, activation = 'sigmoid', input_shape = (latent_size,)),
    Dense(32, activation = 'sigmoid'),
    Dense(64, activation = 'sigmoid'),
    Dense(2 * event_shape[0] * event_shape[1], activation = 'exponential'),
    Reshape((event_shape[0], event_shape[1], 2)),
    tfpl.DistributionLambda \
        (lambda t: tfd.Independent(tfd.Beta(concentration1 = t[..., 0], concentration0 = t[..., 1])))
])


# Connect the encoder and decoder to form the VAE
vae = Model(inputs = encoder.inputs, outputs = decoder(encoder.outputs))


# Define a loss that only estimates the expected reconstruction error,
# -E_{Z ~ q(z|x)}[log p(x|Z)]
def log_loss(x_true, p_x_given_z):
    return -tf.reduce_sum(p_x_given_z.log_prob(x_true))


# Compile and fit the model
vae.compile(loss = log_loss)
vae.fit(x_train, validation_data = x_test, epochs = 10)

Epoch 1/10
1875/1875 [==============================] - 41s 22ms/step - loss: -45334.5352 - val_loss: -51362.8516
Epoch 2/10
1875/1875 [==============================] - 39s 21ms/step - loss: -53343.5117 - val_loss: -54360.8320
Epoch 3/10
1875/1875 [==============================] - 40s 21ms/step - loss: -56219.7969 - val_loss: -57634.0352
Epoch 4/10
1875/1875 [==============================] - 41s 22ms/step - loss: -57441.0781 - val_loss: -58558.2773
Epoch 5/10
1875/1875 [==============================] - 40s 21ms/step - loss: -57147.9180 - val_loss: -57236.7695
Epoch 6/10
1875/1875 [==============================] - 40s 21ms/step - loss: -59585.8672 - val_loss: -59426.6172
Epoch 7/10
1875/1875 [==============================] - 39s 21ms/step - loss: -62342.7188 - val_loss: -62573.9766
Epoch 8/10
1875/1875 [==============================] - 39s 21ms/step - loss: -64889.9609 - val_loss: -65869.4219
Epoch 9/10
1875/1875 [==============================] - 39s 21ms/step - loss: -66032.7812 - val_loss: -67289.3594
Epoch 10/10
1875/1875 [==============================] - 40s 21ms/step - loss: -65745.0938 - val_loss: -68463.7500

<tensorflow.python.keras.callbacks.History at 0x7f86dd8279d0>


# Generate an example reconstruction
example_reconstruction = vae(example_x).mean().numpy().squeeze()


# Plot the example reconstructions
f, axs = plt.subplots(2, 6, figsize = (16, 5))

for j in range(6):
    axs[0, j].imshow(example_x[j, :, :].squeeze(), cmap = 'binary')
    axs[1, j].imshow(example_reconstruction[j, :, :], cmap = 'binary')
    axs[0, j].axis('off')
    axs[1, j].axis('off')

`B.shape`	`x.shape`	`y.shape`
`(2, 2)`	`(2)`	`(2)`
`(n, n)`	`(m)`	`ERROR`
`(n, n)`	`(n)`	`(n)`
`(n, n)`	`(s, n)`	`(s, n)`
`(b, n, n)`	`(n)`	`(b, n)`
`(b, n, n)`	`(b, n)`	`(b, n)`
`(b, n, n)`	`(s, 1, n)`	`(s, b, n)`

Probabilistic Deep Learning with TensorFlow 2

Multivariate Gaussian with full covariance

Full covariance with MultivariateNormalFullTriL¶

The Cholesky decomposition¶

tf.linalg.cholesky¶

What about positive semi-definite matrices?¶

Putting it all together¶

Deprecated: MultivariateNormalFullCovariance¶

Further reading and resources¶

Broadcasting rules

Operations on arrays of different sizes in numpy¶

Numpy's broadcasting rule¶

Broadcasting for univariate TensorFlow Distributions¶

Broadcasting with prob and log_prob methods¶

Broadcasting for multivariate TensorFlow distributions¶

Further reading and resources¶

Tensorflow Distributions

1. Univariate Distributions¶

2. Multivariate Distributions¶

3. The Independent Distribution¶

4. Sampling and log probs¶

5. Trainable Distributions¶

Univariate distributions¶

A word of caution on discrete distributions¶

Work with batch distributions¶

Multivariate Distributions¶

Basic multivariate distributions¶

Batches of multivariate distributions¶

The Independent Distribution¶

Shifting batch dimensions to event dimensions using¶

Using Independent to build a Naive Bayes classifier¶

Introduction to newsgroups data set¶

A Naive Bayes classifier for newsgroup¶

Sampling and log probs¶

Naive Bayes example¶

Computing log_probs¶

Trainable Distributions¶

Maximum likelihood estimation

Introduction¶

Probability mass and probability density functions¶

The likelihood function¶

Bernoulli distribution¶

Normal (Gaussian) distribution¶

Maximum likelihood estimation¶

The negative log-likelihood¶

Training neural networks¶

Bernoulli distribution: binary classifiers¶

Normal distribution: least squares regression¶

Conclusion¶

Further reading and resources¶

Bayes by backprop

Introduction¶

Bayesian learning¶

Bayesian neural network with weight uncertainty -- in principle¶

Variational Bayes¶

A backpropagation scheme¶

The idea¶

The reparameterization trick¶

Implementation¶

Minibatches¶

Conclusion¶

Further reading and resources¶

Probabilistic Layers And Bayesian Neural Networks

The DistributionLambda layer¶

Create a probabilistic model using the DistributionLambda layer¶

Use the forward model to create probabilistic training data¶

Create a new probabilistic model with the wrong weights¶

Train the new model with the negative loglikelihood¶

Probabilistic layers¶

Create data¶

Deterministic linear regression with MSE loss¶

Probabilistic linear regression with both user-defined and learned variance¶

Probabilistic linear regression with nonlinear learned mean & variance¶

The DenseVariational layer¶

Create linear data with Gaussian noise¶

Create the prior and posterior distribution for model weights¶

Aside: analytical posterior¶

\mathbf{\mu}¶

Create the model with DenseVariational layers¶

Train model and inspect¶

Full covariance with `MultivariateNormalFullTriL`¶

`tf.linalg.cholesky`¶

Deprecated: `MultivariateNormalFullCovariance`¶

Broadcasting with `prob` and `log_prob` methods¶

1. Univariate Distributions ¶

2. Multivariate Distributions ¶

3. The Independent Distribution ¶

4. Sampling and log probs ¶

5. Trainable Distributions ¶

Using `Independent` to build a Naive Bayes classifier¶

Introduction to `newsgroups` data set¶

A Naive Bayes classifier for `newsgroup`¶

The `DistributionLambda` layer¶

Create a probabilistic model using the `DistributionLambda` layer¶

The `DenseVariational` layer¶

Create the model with `DenseVariational` layers¶

The `ScaleMatvec` bijectors¶

The `ScaleMatvecDiag` bijector¶

The `ScaleMatvecTriL` bijector¶

The `LinearOperator` class and `ScaleMatvecLinearOperator` bijector¶

The `LinearOperatorDiag` class¶

The `LinearOperatorFullMatrix` class¶