Code
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation
import plotly.graph_objects as go
from plotly.subplots import make_subplotsimport numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation
import plotly.graph_objects as go
from plotly.subplots import make_subplotsGoal: Take a sequence of amino acids and predict a their three-dimensional shape
Problem: the conformational space is too big to do the guess and check approach
Why do we care?
It provides a good framework to test different neural network architectures
Protein structure is important in determining function
It speeds up research and discovery
A protein cannot sample all possible conformations randomly to reach its native fold, because doing so would take longer than the age of the universe.
For 10 residues:
# Protein Folding Parameters
residues = 10 # Residues in the polypeptide chain
angles_per_residue = 3 # Conformational choices (φ, ψ, χ angles)
correct_structure = tuple(np.random.randint(0, angles_per_residue, size=residues))
n_trials = 200 # Simulation trials for statistics
max_steps = 10000 # Max attempts for random search before failure
# RANDOM SEARCH: Brute-force folding exploration
def random_search():
for step in range(1, max_steps + 1):
guess = tuple(np.random.randint(0, angles_per_residue, size=residues))
if guess == correct_structure:
return step
return None # Failed to fold in max_steps
# GUIDED SEARCH: Energy landscape downhill folding (local folding first)
def guided_search():
# Simulate folding one residue at a time correctly
return residues # Always folds in N steps
# Run Experiments
random_steps = [random_search() for _ in range(n_trials)]
success_steps = [r for r in random_steps if r is not None]
failures = n_trials - len(success_steps)
# Plot Results
plt.figure(figsize=(8, 6))
# Random Search Distribution
if success_steps:
plt.hist(success_steps, bins=30, color='crimson', alpha=0.7, label='Random Search (Successes)')
# Guided Search Benchmark Line
plt.axvline(x=guided_search(), color='green', linestyle='--', linewidth=2, label='Guided Search (Energy Funnel)')
plt.title("Levinthal’s Paradox: Why Proteins Don’t Search Randomly", fontsize=16)
plt.xlabel("Steps Required to Fold", fontsize=13)
plt.ylabel("Frequency (Trials)", fontsize=13)
plt.text(0.25, 0.65, f"Random Search Failures: {failures}/{n_trials}",
transform=plt.gca().transAxes, fontsize=12, color='black')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
What random sampling misses:
# Set up funnel surface
x = np.linspace(-2, 2, 200)
y = np.linspace(-2, 2, 200)
X, Y = np.meshgrid(x, y)
Z = (X**2 + Y**2) + 0.3 * np.sin(5*X)*np.cos(5*Y) # energy landscape
# Simulated path for the rolling ball (spiral into the center)
theta = np.linspace(0, 6 * np.pi, 150)
r = np.linspace(2, 0.1, 150)
ball_x = r * np.cos(theta)
ball_y = r * np.sin(theta)
ball_z = (ball_x**2 + ball_y**2) + 0.3 * np.sin(5*ball_x)*np.cos(5*ball_y)
# Plot setup
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z, cmap='coolwarm', edgecolor='none', alpha=0.9)
# Plot initial ball and trail
ball, = ax.plot([], [], [], 'ro', markersize=8) # red ball
trail, = ax.plot([], [], [], 'r--', linewidth=1, alpha=0.6) # trail
ax.set_title("Protein Folding Funnel: Thermodynamic Landscape")
ax.set_xlabel("Conformational Space X")
ax.set_ylabel("Conformational Space Y")
ax.set_zlabel("Free Energy")
ax.view_init(elev=30, azim=45)
def init():
ball.set_data([], [])
ball.set_3d_properties([])
trail.set_data([], [])
trail.set_3d_properties([])
return ball, trail
def update(frame):
ball.set_data(ball_x[frame], ball_y[frame])
ball.set_3d_properties(ball_z[frame])
trail.set_data(ball_x[:frame+1], ball_y[:frame+1])
trail.set_3d_properties(ball_z[:frame+1])
return ball, trail
ani = FuncAnimation(fig, update, frames=len(ball_x), init_func=init, blit=True, interval=50)
plt.show()
Thermodynamic principles can reduce the conformational space and ‘guide’ protein folding:
# Generate surface
x = np.linspace(-2, 2, 100)
y = np.linspace(-2, 2, 100)
X, Y = np.meshgrid(x, y)
Z = (X**2 + Y**2) + 0.3 * np.sin(5*X)*np.cos(5*Y)
# Spiral path
theta = np.linspace(0, 6 * np.pi, 150)
r = np.linspace(2, 0.1, 150)
path_x = r * np.cos(theta)
path_y = r * np.sin(theta)
path_z = (path_x**2 + path_y**2) + 0.3 * np.sin(5*path_x)*np.cos(5*path_y)
# Create interactive 3D plot
fig = go.Figure()
# Add surface
fig.add_trace(go.Surface(z=Z, x=X, y=Y, colorscale='RdBu', opacity=0.9, showscale=False))
# Add folding path
fig.add_trace(go.Scatter3d(
x=path_x, y=path_y, z=path_z,
mode='lines+markers',
line=dict(color='black', width=4),
marker=dict(size=2, color='black'),
name='Folding Path'
))
# Add start and end markers
fig.add_trace(go.Scatter3d(
x=[path_x[0]], y=[path_y[0]], z=[path_z[0]],
mode='markers',
marker=dict(size=6, color='blue'),
name='Unfolded Start'
))
fig.add_trace(go.Scatter3d(
x=[path_x[-1]], y=[path_y[-1]], z=[path_z[-1]],
mode='markers',
marker=dict(size=6, color='green'),
name='Native Fold'
))
# Layout
fig.update_layout(
title='Protein Folding Funnel (Interactive)',
scene=dict(
xaxis_title='Conformation X',
yaxis_title='Conformation Y',
zaxis_title='Free Energy'
),
margin=dict(l=0, r=0, b=0, t=30)
)
fig.show()Given discrete sequential information we want to figure out what the whole object looks like
Large Language Models (LLMs) were built to solve similar problems
Language:
Proteins:
Carbon backbone (𝜶-Carbon)
Carboxyl group
Amine group
Sidechain

# Generate surface
x = np.linspace(-2, 2, 100)
y = np.linspace(-2, 2, 100)
X, Y = np.meshgrid(x, y)
Z = (X**2 + Y**2) + 0.3 * np.sin(5*X)*np.cos(5*Y)
# Spiral path
theta = np.linspace(0, 6 * np.pi, 150)
r = np.linspace(2, 0.1, 150)
path_x = r * np.cos(theta)
path_y = r * np.sin(theta)
path_z = (path_x**2 + path_y**2) + 0.3 * np.sin(5*path_x)*np.cos(5*path_y)
# Create interactive 3D plot
fig = go.Figure()
# Add surface
fig.add_trace(go.Surface(z=Z, x=X, y=Y, colorscale='RdBu', opacity=0.9, showscale=False))
# Add folding path
fig.add_trace(go.Scatter3d(
x=path_x, y=path_y, z=path_z,
mode='lines+markers',
line=dict(color='black', width=4),
marker=dict(size=2, color='black'),
name='Folding Path'
))
# Add start and end markers
fig.add_trace(go.Scatter3d(
x=[path_x[0]], y=[path_y[0]], z=[path_z[0]],
mode='markers',
marker=dict(size=6, color='blue'),
name='Unfolded Start'
))
fig.add_trace(go.Scatter3d(
x=[path_x[-1]], y=[path_y[-1]], z=[path_z[-1]],
mode='markers',
marker=dict(size=6, color='green'),
name='Native Fold'
))
# Layout
fig.update_layout(
title='Protein Folding Funnel (Interactive)',
scene=dict(
xaxis_title='Conformation X',
yaxis_title='Conformation Y',
zaxis_title='Free Energy'
),
margin=dict(l=0, r=0, b=0, t=30)
)
fig.show()The native structure of a protein is determined solely by its amino acid sequence
The primary structure (amino acid sequence) encodes all the information needed for proper 3D folding
Folding occurs spontaneously under physiological conditions
The folded (native) state is the thermodynamic minimum — lowest free energy (ΔG)
\[ ΔS_{universe}≥0 \tag{1}\]
Polypeptide chains are flexible and have many possible conformations (high entropy)
When the chain folds:
Backbone and side chains adopt specific orientations
Mobility and conformational freedom are reduced
Local decrease in entropy within the protein <– This must be offset to not violate the 2nd law
Nonpolar side chains do not interact favorably with water
In the unfolded state:
Water molecules form structured clathrate shells around exposed hydrophobic residues to minimize disruption of the hydrogen-bonding network
Entropically unfavorable for water
Hydrophobic collapse ‘breaks’ clathrate shells
Freed water molecules regain full mobility <– This is the energy offset
This large increase in solvent entropy overcomes the local entropy loss from protein ordering
The caveat here is that there can be local minima within the topology
Incorrect disulfide bond formation, misaligned hydrophobic cores, aggregates of exposed hydrophobic residues
Cells use chaperones to help escape these traps by:
Providing isolated folding environments
Using ATP-driven cycles to unfold/refold misfolded proteins
A conceptual manifold where:
x-axis = Conformational degrees of freedom
y-axis = Free energy associated with each conformation
High-dimensional surface shaped by:
Steric constraints
Bond angles
Rotamer libraries
Electrostatics
Solvent interactions
Local roughness means proteins can settle in local energy minima
The Rastrigin function is a non-convex, multimodal function used to test optimization algorithms, especially in high-dimensional, noisy, or non-linear settings.
It’s useful because it has many local minima making it tricky for algorithms to find the true global minimum.
\[ f(\mathbf{x}) = A n + \sum_{i=1}^{n} \left[ x_i^2 - A \cos(2 \pi x_i) \right] \tag{2}\]
def rastrigin(x, y):
"""Compute the Rastrigin function."""
return 20 + 0.2 * x**2 + 0.1 * y**2 - 2 * np.cos(0.5 * np.pi * x) - 1.5 * np.cos(0.5 * np.pi * y)
# Generate x and y data points
x = np.linspace(-5.12, 5.12, 400)
y = np.linspace(-5.12, 5.12, 400)
X, Y = np.meshgrid(x, y)
Z = rastrigin(X, Y)
# Create the plot
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
# Plot the surface
surf = ax.plot_surface(X, Y, Z, cmap='magma', edgecolor='none', alpha=0.9)
# Add color bar
cbar = fig.colorbar(surf, ax=ax, shrink=0.6, pad=0.1)
cbar.set_label('Function value (Z)', rotation=270, labelpad=15)
# Axis labels and title
ax.set_title("Rastrigin Function Landscape (n=2)", fontsize=14, pad=20)
# Adjust view angle
ax.view_init(elev=45, azim=-60)
# Adjust layout to accommodate title
plt.subplots_adjust(top=0.88) # Makes room for the title
plt.show()
We just showed that:
Protein folding is a stochastic and thermodynamic process
The native fold corresponds (typically) to a global free energy minimum
The folding pathway is not deterministic (conformational space is sampled based on probabilistic rules and guided by thermodynamic constraints)
\[ P(folded state∣sequence,environment)∝e^{−G(x)/kT} \tag{3}\]
This tells that to solve this problem:
Traditional deterministic neural networks are not appropriate as they do not capture epistemic uncertainty
Bayesian Neural Networks (BNNs) learn distributions over weights and produce distributions over outputs
Why BNNs fit this problem well:
| Thermodynamics | BNN Solution |
|---|---|
| Energy landscape | Loss surface |
| Free energy minima | Likely parameter region |
| Boltzmann distribution | Weight posterior |
| Partition function | Bayesian model averaging |
| Conformational sampling | Posterior predictive sampling |
BNNs treat weight as random variables with a posterior:
\[ P(w∣data)∝P(data∣w)P(w) \]
The prediction takes inputs and integrates over all plausible models:
\[ P(y∣x,data)=∫P(y∣x,w)P(w∣data)dw \]
In Bayesian inference, the posterior is the probability distribution of parameters after observing data. Mathematically defined as:
\[ P(\theta \mid data) = \frac{P(data \mid \theta) \cdot P(\theta)}{P(data)} \]
Where:
\(θ\) = parameters (e.g., neural network weights)
\(P(θ)\) = prior: what we believed about the parameters before seeing the data
\(P(data∣θ)\) = likelihood: how likely the data is given those parameters
\(P(θ∣data)\) = posterior: updated belief about the parameters after seeing the data
\(P(data)\) = evidence or marginal likelihood (normalizing constant)
Traditional deep learning optimize a single point estimate (maximum likelihood or maximum a posteriori)
BNNs keep a distribution over possible weights:
\(P(w∣D)\) = posterior over weights given training data D
Predictions for a new input are made by marginalizing over this posterior:
\[ P(y∣x,D)=∫P(y∣x,w)⋅P(w∣D)dw \]
So the model accounts for uncertainty in the parameters
Neural Network = is a stack of operations that maps an input \(x\) to an output \(y\) such that:
\[ y=f(x;θ) \]
Where:
\(f\) is a composition of linear transformations and non-linear activation functions
\(θ\) is the canonical natural parameter
The outcome \(y\) is a distribution over the natural parameter rather than a point estimate

A raw sequence amino acid sequence is broken up into a strucutred machine readable format
Map each amino acid to an index from a vocabulary:
| Residue | Token ID |
|---|---|
| M | 2 |
| G | 5 |
| G | 5 |
| A | 1 |
| V | 20 |
| L | 11 |
| I | 9 |
| Token | Purpose |
|---|---|
<cls> |
Classification marker |
<mask> |
Masked language modeling training |
<pad> |
Padding for batch input |
So:
MGGAVLI –> [<cls> 2 5 5 1 20 11 9 <pad> <pad>]
AlphaFold starts with tokenized sequences and adds features:
| Input Feature | Description |
|---|---|
| Sequence Tokens | Raw residue identity (mapped to an embedding) |
| MSA Tokens | Rows from the multiple sequence alignment (shows evolutionary variation) |
| Pair Representation | Distances, co-variation, and geometric priors between residues |
We need to convert our tokens to vectors that account for every feature we embedded

Each amino acid gets turned into a vector composed of feature embeddings

Each feature adds a dimension to the vector representation:
Hydrophobicity
Charge
Evolutionary conservation
Sequence position
High dimensionality is required to capture enough features to accurately predict structure
AlphaFold embeds 384 dimensions
A Transformer is a neural network architecture designed to model relationships between elements in a sequence regardless of their position or distance from each other
Each Transformer is built from repeating units called blocks

Each block includes:
Learns what to focus on — for each token, it figures out which other tokens are relevant
Each head learns a different type of relationship
Allows the model to look at the whole sequence and capture contextual signals in parallel
Transformers don’t read left to right so we inject information about where each token is in the sequence
After attention figures out relationships, this step helps process and transform the data further
Helps in avoiding vanishing gradients
Residues that are far apart in sequence can be close in 3D space
A residue’s final position may depend strongly on a nonlocal partner
Tokens (amino acids in this case) are mapped to:
A query vector \(Q\)
A key vector \(K\)
A value vector \(V\)
These are learned projections of:
\[ Q=XW^{Q},\ K=XW^K,\ V=XW^V \]
Where \(X\) is the input and \(W^{Q,K,V}\) are trained weight matrices
Attention the is:
\[ Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V \]
Where \(\sqrt{d_k}\) is the dimension of the \(K\) and \(Q\) vector
Similarity scores: Compute dot product \(QK^T\) to measure how much one token “wants to attend to” another
Scale by \(\sqrt{d_k}\) to prevent large dot products (stabilizes gradients)
Softmax: Turns scores into probabilities (i.e., attention weights)
Weighted sum: Multiply weights with \(V\) to get the output weighted by importance
Dot products tell us in a mathematical sense how well two vectors align.
Larg dot products indicate a higher degree of alignment between the key and the query and smaller dot products indicate weaker alignment.
Example: Heatmap of how much each query attends to each key (synthetic data)
import numpy as np
import plotly.express as px
# Parameters
seq_len = 20
np.random.seed(42)
# Generate synthetic embeddings for each residue
residue_embeddings = np.random.rand(seq_len, 16)
# Dot product attention scores
attention_scores = residue_embeddings @ residue_embeddings.T
# Row-wise softmax
def softmax(x):
e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
return e_x / e_x.sum(axis=-1, keepdims=True)
attention_weights = softmax(attention_scores).astype(np.float64) # Force float64
# Plot
fig = px.imshow(
attention_weights,
labels=dict(x="Residue j (Key)", y="Residue i (Query)", color="Attention Weight"),
color_continuous_scale="Viridis",
title="Mocked Per-Residue Attention Map (Query to Key)"
)
fig.update_layout(
xaxis=dict(side="bottom"),
width=600,
height=500
)
fig.show()The attention matrix in Transformers is not necessarily symmetric because:
Each row \(i\) in the attention matrix represents how much the \(Q_ith\) attends to every other residue \(K_ith\)
Attention is Directional:
Attention(i → j) ≠ Attention(j → i)
That is: how much residue i attends to j is not the same as how much residue j attends to i
This directional nature comes from the Query-Key Dot Product:
Each row is calculated by taking the dot product between a query vector and all key vectors. Even though both are derived from the same embedding they are transformed with separate weight matrices \(W_Q\) and \(W_K\). So \(Q_i · K_j\) ≠ \(Q_j · K_i\)
Softmax is Applied Row-wise: The attention matrix is normalized across each row, not symmetrically. This makes each row a valid probability distribution (summing to 1) but doesn’t preserve symmetry
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
# Parameters
seq_len = 20
np.random.seed(42)
# Generate synthetic embeddings
residue_embeddings = np.random.rand(seq_len, 16)
# Raw dot product scores (symmetric)
raw_scores = residue_embeddings @ residue_embeddings.T
# Softmax function
def softmax(x):
e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
return e_x / e_x.sum(axis=-1, keepdims=True)
# Compute attention weights
attention_weights = softmax(raw_scores)
# Symmetrize
symmetric_attention = (attention_weights + attention_weights.T) / 2
# Create subplots
fig = sp.make_subplots(
rows=1, cols=3,
subplot_titles=(
"Raw Dot Product Scores (Symmetric)",
"Attention Weights (Asymmetric)",
"Symmetrized Attention (Post-hoc)"
),
horizontal_spacing=0.1
)
fig.add_trace(go.Heatmap(z=raw_scores, colorscale='Viridis', colorbar=dict(title='Score', x=0.27)), row=1, col=1)
fig.add_trace(go.Heatmap(z=attention_weights, colorscale='Viridis', colorbar=dict(title='Weight', x=0.63)), row=1, col=2)
fig.add_trace(go.Heatmap(z=symmetric_attention, colorscale='Viridis', colorbar=dict(title='Weight', x=1)), row=1, col=3)
fig.update_layout(
title_text="Effect of Softmax on Attention Symmetry"
)
fig.show()Left: Symmetric raw dot product scores
Middle: Attention weights after row-wise softmax
Right: Post-hoc symmetrized attention weights —> averaging the softmax matrix with its transpose
Single attention head:
Each token (or residue) computes attention scores with all other tokens
Generates a weighted average of their values
But this is only one type of relationship evaluated
Multi-head attention lets the model:
Attend to different types of relationships in parallel:
local context (by-residue attention)
global structure (coevolutionary attention)
biochemical properties (geometric and surface charge attention)
Mathematically for a token embedding of dimension \(d\) we compute \(h\) heads such that:
\[ head_i = Attention(Q W^Q_i, K W^K_i, V W^V_i) \]
with projection matrices \(W_i^Q, W_i^K, W_i^V \in \mathbb{R}^{d \times d_h}\) and \(d_h = d / h\) for dimensional consistency
∴ Multi-head Attention can be stated as:
\[ MultiHead(Q, K, V) = Concat(head_1, ..., head_h)W^O \]
Where \(W^O \in \mathbb{R}^{d \times d}\) is the final linear projection that combines all heads into a single
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
# Set a random seed for reproducibility
np.random.seed(42)
# Define a simple list of tokens and assign each a random embedding.
tokens = ["The", "cat", "sat"]
d_model = 4 # Embedding dimension
n_tokens = len(tokens)
# Generate random embeddings for each token (shape: n_tokens x d_model)
embeddings = np.random.rand(n_tokens, d_model)
# For this demonstration, we set:
# Query (Q) = Key (K) = Value (V) = embeddings.
Q = embeddings
K = embeddings
V = embeddings
# Compute raw attention scores: Q dot K^T
# Optionally, we scale the dot products by the square root of the dimension (√d_model)
scores = np.dot(Q, K.T) / np.sqrt(d_model)
# Define a softmax function that works row-wise for numerical stability.
def softmax(x):
# subtract the row max for numerical stability
exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
# Compute the attention weights by applying the softmax function row-wise.
attention_weights = softmax(scores)
# Compute the final output as the weighted sum of values.
output = np.dot(attention_weights, V)
# Function to update the visualization based on the interactive slider step.
def update(step):
plt.figure(figsize=(8, 5))
plt.clf() # clear the figure for a fresh plot
if step == 0:
# Step 0: Show the input embeddings.
plt.title("Step 0: Input Embeddings (Q, K, V)")
text = f"Tokens: {', '.join(tokens)}\n\n"
text += "Embeddings:\n" + np.array2string(embeddings, precision=2, separator=', ') + "\n\n"
text += "For simplicity, we use: Q = K = V = Embeddings"
plt.text(0.05, 0.5, text, fontsize=12, transform=plt.gca().transAxes)
plt.axis('off')
elif step == 1:
# Step 1: Compute dot product scores Q.dot(K^T)
plt.title("Step 1: Compute Dot Product Scores (Q dot K^T)")
# Display the matrix as an image
im = plt.imshow(scores, cmap='viridis', interpolation='none')
plt.colorbar(im)
plt.xlabel("Key Tokens")
plt.ylabel("Query Tokens")
plt.xticks(ticks=range(n_tokens), labels=tokens)
plt.yticks(ticks=range(n_tokens), labels=tokens)
plt.tight_layout()
elif step == 2:
# Step 2: Apply Softmax to obtain attention weights.
plt.title("Step 2: Apply Softmax to the Scores")
im = plt.imshow(attention_weights, cmap='viridis', interpolation='none')
plt.colorbar(im)
plt.xlabel("Key Tokens")
plt.ylabel("Query Tokens")
plt.xticks(ticks=range(n_tokens), labels=tokens)
plt.yticks(ticks=range(n_tokens), labels=tokens)
plt.tight_layout()
elif step == 3:
# Step 3: Compute the final output (weighted sum of V)
plt.title("Step 3: Weighted Sum of V (Final Output)")
# Show the output matrix. Here each row corresponds to a token's new representation.
im = plt.imshow(output, cmap='viridis', interpolation='none')
plt.colorbar(im)
plt.xlabel("Embedding Dimensions")
plt.ylabel("Token Index")
plt.yticks(ticks=range(n_tokens), labels=tokens)
plt.tight_layout()
plt.show()
# Create an interactive slider for stepping through the visualization.
widgets.interact(update, step=widgets.IntSlider(min=0, max=3, step=1, value=0, description='Step'))<function __main__.update(step)>
# Settings
num_heads = 4
seq_len = 10
aa_labels = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I']
# Generate synthetic attention weights
np.random.seed(42)
attention_heads = [np.random.dirichlet(np.ones(seq_len), size=seq_len) for _ in range(num_heads)]
# Global color scale limits
vmin = 0
vmax = max(np.max(h) for h in attention_heads)
# Create 2x2 subplot grid
fig = make_subplots(
rows=2, cols=2,
subplot_titles=[f'Head {i + 1}' for i in range(num_heads)],
horizontal_spacing=0.1,
vertical_spacing=0.1,
)
# Add heatmaps without colorbars
for i, attn in enumerate(attention_heads):
row, col = divmod(i, 2)
fig.add_trace(
go.Heatmap(
z=attn,
x=aa_labels,
y=aa_labels,
colorscale='Viridis',
zmin=vmin,
zmax=vmax,
showscale=False # Disable individual colorbars
),
row=row + 1,
col=col + 1
)
# Add a dummy heatmap just for global colorbar
fig.add_trace(
go.Heatmap(
z=[[vmin, vmax]], # Dummy 2-value array
colorscale='Viridis',
showscale=True,
colorbar=dict(
title='Attention weight',
x=1.02, # Move it outside grid
len=0.8 # Control bar length
)
),
row=1, col=2 # Dummy location — doesn't render visibly
)
# Layout
fig.update_layout(
height=800,
width=900,
title_text='Simulated Multi-Head Attention (Amino Acid Labels)',
showlegend=False,
margin=dict(r=100)
)
fig.update_xaxes(title='Key (Attending to)', tickfont_size=10)
fig.update_yaxes(title='Query', tickfont_size=10)
fig.show()After a residue has “listened” to others via attention, the FFN lets it process what it heard.
It applies a non-linear transformation to refine the features, helping the model learn non-linear patterns like:
buried vs. exposed residues
loops vs. helices
biochemical motifs
Each token vector x∈Rdx∈Rd goes through:
x→Linear1→Activation (ReLU or GELU)→Linear2x→Linear1→Activation (ReLU or GELU)→Linear2
Mathematically:
FFN(x)=W2⋅ϕ(W1x+b1)+b2FFN(x)=W2⋅ϕ(W1x+b1)+b2
W1W1: Expands the hidden size (e.g., from 256 → 1024)
ϕϕ: Activation function (e.g., GELU in AlphaFold)
W2W2: Projects back down to original size
This is applied to each token/residue independently.
High dimensional relationships are non-linear and the way we introduce non-linear components to NNs is with activation functions
Without a nonlinear activation function, a neural network reduces to a single-layer linear model, no matter how many layers you stack
Each neuron within the network performs the linear operations and then an activation function is applied to the output
That is:
\[ a^{l}=ϕ(W^{l}x^{l−1}+b^{l}) \]
Where:
\(ϕ\) is the activation function
\(l\) is the layer
\(W\) and \(b\) are weights and biases (the linear component that the neuron is computing)
and \(x^{l-1}\) is the output of the previous layer
# Activation functions
def relu(x): return np.maximum(0, x)
def leaky_relu(x, alpha=0.1): return np.where(x > 0, x, alpha * x) # stronger leak for clarity
def sigmoid(x): return 1 / (1 + np.exp(-x))
def tanh(x): return np.tanh(x)
def swish(x): return x * sigmoid(x)
def gelu(x): return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))
def elu(x, alpha=1.0): return np.where(x >= 0, x, alpha * (np.exp(x) - 1))
def softplus(x): return np.log1p(np.exp(x))
# Input
x = np.linspace(-5, 5, 1000)
# Activation dictionary
activations = {
"ReLU": relu,
"Leaky ReLU": leaky_relu,
"Sigmoid": sigmoid,
"Tanh": tanh,
"Swish": swish,
"GELU": gelu,
"ELU": elu,
"Softplus": softplus
}
# Plot
fig, axes = plt.subplots(2, 4, figsize=(10, 3), constrained_layout=True)
axes = axes.flatten()
for i, (name, func) in enumerate(activations.items()):
ax = axes[i]
ax.plot(x, func(x), linewidth=2)
ax.set_title(name, fontsize=14)
ax.axhline(0, color='black', linewidth=0.5, linestyle='--')
ax.axvline(0, color='black', linewidth=0.5, linestyle='--')
ax.grid(True, linestyle=':', alpha=0.7)
ax.set_xlim([-5, 5])
ax.set_ylim([-1.25, 4])
ax.tick_params(labelsize=10)
# Remove unused axes
for j in range(len(activations), len(axes)):
fig.delaxes(axes[j])
plt.show()
Currently AlphaFold uses GELU
\[ \text{GELU}(x) = x \cdot \Phi(x) = x \cdot \frac{1}{2} \left(1 + \operatorname{erf}\left( \frac{x}{\sqrt{2}} \right) \right) \]
GELU outputs close to linearity for positive inputs, but smoothly suppresses negatives (without hard zeroing like ReLU)
Let’s let this input pass through, but only scaled by how likely it is to be positive in a Gaussian sense
It’s stochastically smooth
Stochastic: using probability (from a Gaussian) to scale the signal (instead of a binary on/off)
Smooth: infinitely differentiable which gives smoother gradients
If a residue has weak but not insignificant influence on folding a hard cutoff like ReLU might throw away that influence
GELU respects that influence but in a manner proportional to how confident the model is in the true leverage of the residue
from scipy.stats import norm
# GELU definition
def gelu(x):
return 0.5 * x * (1 + np.tanh(np.sqrt(2/np.pi) * (x + 0.044715 * x**3)))
# Input range reflecting typical normalized activations
x = np.linspace(-4, 4, 1000)
y = gelu(x)
normal_pdf = norm.pdf(x) * 4 # scaled normal curve for context
# Plot
plt.figure(figsize=(8, 4))
plt.plot(x, y, label='GELU Activation', color='blue', linewidth=2)
plt.plot(x, normal_pdf, color='gray', linestyle='--', label='Standard Normal (scaled)', alpha=0.6)
# Axis and styling
plt.axhline(0, color='black', linestyle='--', linewidth=0.5)
plt.axvline(0, color='black', linestyle='--', linewidth=0.5)
plt.title("GELU Activation on Normalized Inputs (AlphaFold Context)", fontsize=16)
plt.xlabel("Input (normalized, mean=0, std=1)", fontsize=13)
plt.ylabel("Output", fontsize=13)
plt.xlim([-4, 4])
plt.ylim([-1.5, 4])
plt.grid(True, linestyle=':', alpha=0.6)
plt.legend(fontsize=12)
plt.tight_layout()
plt.fill_between(x, y, where=(x > -2) & (x < 2), color='blue', alpha=0.1, label='Smooth transition')
plt.show()

Purpose: Extract feature representations crucial for predicting protein structure.
Input Representations:
Multiple Sequence Alignment (MSA) Representation:
Encodes evolutionary constraints by capturing co-evolutionary signals across sequences.
Illustrates intra-sequence relationships, where correlated mutations indicate evolutionary pressure.
Pair Representation:
Captures physical proximity and geometric relationships among residue pairs.
Focuses on inter-sequence interactions within a single protein chain.
Captures Co-evolutionary Constraints:
Mathematical Representation of Attention in MSA:
Self-Attention Equation:
AttentionMSA(Q,K,V)=softmax (QKTdk)VAttentionMSA(Q,K,V)=softmax(dkQKT)V
Notation:
QQ, KK, VV: Query, Key, and Value matrices derived from the MSA features.
dkdk: Dimensionality of the key vectors.
Models Geometric and Proximity Constraints:
Example Equation for Modeling Pair Distances:
Distance Function:
dij=∥pi−pj∥2dij=∥pi−pj∥2
Where:
Alignment with Statistical Physics and Information Theory:
Assumption: Sequence co-variation indirectly constrains the protein fold.
Interpretation:
Statistical Physics View:
Information Theory View:
The Evoformer is the core feature extraction module within AlphaFold, responsible for learning both intra-sequence and inter-sequence relationships by operating jointly over two distinct but interacting representations: the Multiple Sequence Alignment (MSA) representation and the Pair representation. The MSA representation captures co-evolutionary constraints — residues that mutate in a correlated fashion across evolutionary space — while the Pair representation models the physical proximity and geometric relationships between residue pairs within a single sequence. Crucially, the Evoformer uses specialized attention blocks to allow bidirectional information flow between these two representations.
Architecturally, the Evoformer is composed of stacked Transformer-like blocks but diverges from standard NLP transformers in its inductive biases. The model explicitly factors evolutionary context (MSA attention) from spatial/geometric context (Pair attention), which allows it to avoid overfitting to local sequence patterns and instead learn representations that generalize across structural homology. The design reflects the underlying assumption that sequence co-variation provides indirect but highly informative constraints on a protein’s physical fold — an insight well-aligned with principles from statistical physics and information theory.

Final stage of AlphaFold’s architecture — converts Evoformer outputs into atomic-level 3D coordinates.
Learns spatial geometry (distance, angles, torsions) in a physically plausible way.
Integrates outputs from the Pair Representation (inter-residue features) to build a 3D model that is differentiable and end-to-end trainable.
Single Representation: 1D features from Evoformer per residue.
Pair Representation: 2D features between residue pairs from Evoformer.
(Optional) Template structures, if available, provide geometric priors.
Key idea: Encodes geometric relationships in 3D without being affected by rotations or translations.
Ensures SE(3)-invariance, meaning predictions are geometry-aware and physically valid.
Let:
xi∈R3xi∈R3: current estimate of the 3D position of residue ii
fifi: feature vector of residue ii
R,tR,t: rotation and translation matrices
Then IPA computes attention using 3D points as queries/keys:
IPAij=softmax(−∥Rxi+t−xj∥2+fi⊤fj)IPAij=softmax(−∥Rxi+t−xj∥2+fi⊤fj)
Learned attention is based on:
3D distance between residues (invariant)
Dot-product of features (as in standard attention)
Updated features refine the estimated 3D position while respecting invariant geometry.
Predicts the N–Cα–C backbone frame using local torsion angles.
The model first predicts:
Bond lengths and angles
Torsion angles: ϕ (phi), ψ (psi), ω (omega)
Using standard bond geometry rules:
Predict the position of each atom:
r⃗C=r⃗Cα+ℓ⋅u⃗(θ,ϕ)rC=rCα+ℓ⋅u(θ,ϕ)
where ℓℓ is bond length and u⃗u is a unit vector computed from predicted angles.
Backbone atoms are positioned using rigid frame transformations.
Uses predicted chi angles (χ₁, χ₂, …) to build sidechain conformations.
Employs a rotamer library prior to ensure biological plausibility.
FAPE (Frame Aligned Point Error):
Measures deviation between predicted and true atomic positions in local coordinate frames.
Encourages both global accuracy and local consistency.
FAPE(xpred,xtrue)=∥Tpred−1(xtrue)−xpred∥2FAPE(xpred,xtrue)=∥Tpred−1(xtrue)−xpred∥2
Tpred−1Tpred−1: transforms ground truth into predicted frame.
This makes the loss invariant to global rotation/translation.
Clash Loss: Penalizes atomic overlaps.
Bond Length / Angle Losses: Enforces chemical feasibility.
Full 3D atomic coordinates for:
Backbone atoms (N, Cα, C, O)
Sidechains (all heavy atoms)
Can be directly rendered in PDB format.
The Structure Module serves as AlphaFold’s differentiable geometric decoder — translating high-dimensional latent representations into explicit atomic coordinates within a physically plausible 3D space. Unlike the Evoformer, which operates in sequence/pair latent space, the Structure Module introduces geometric reasoning directly by leveraging Invariant Point Attention (IPA). This mechanism allows the model to attend over learned spatial frames while maintaining equivariance to global rotations and translations — an architectural choice that aligns with the SE(3)-invariance of protein structure space.
Importantly, the Structure Module does not perform a single-pass prediction but instead refines atomic positions iteratively, akin to gradient descent through an implicit energy landscape. At each iteration, the model integrates local backbone geometry, torsion angle predictions, and long-range distance constraints derived from the Evoformer representations. This design mirrors energy minimization strategies used in classical structure prediction (e.g., Rosetta), but here it is fully learned end-to-end without reliance on pre-specified force fields.
AlphaFold’s input pipeline reflects a hybrid of classical bioinformatics preprocessing and learned representation modeling. The primary input is a single protein sequence; however, its predictive power is substantially enhanced through evolutionary feature augmentation. The model leverages Multiple Sequence Alignments (MSAs) to capture co-evolutionary constraints — effectively sampling from the implicit mutational landscape shaped by evolutionary pressure.
In addition to MSAs, AlphaFold optionally incorporates structural templates — homologous protein structures identified through sequence or structural similarity search. While template usage can improve accuracy, particularly in regions of sparse evolutionary signal, AlphaFold’s architecture is explicitly designed to operate in template-free settings, relying instead on its learned inductive biases and MSA-derived features. This design choice speaks to the model’s generalizability and robustness across varying levels of input information density.
| Metric | Definition | Biophysical Analogy | Interpretation |
|---|---|---|---|
| pLDDT | Local confidence per residue \(p_i∈[0,100]\) | Inverse local flexibility | High pLDDT \(⇒\)deep local free energy minimum |
| PAE | Pairwise error between residues \(i,j\) | Domain-domain flexibility | High PAE \(⇒\) flat energy surface between \(i,j\) |
Protein folding is fundamentally stochastic — a search for the global minimum of F(X)F(X) over a rugged, high-dimensional landscape.
Regions with:
High pLDDT → Steep, well-defined minima
Low pLDDT → Shallow basins or flat regions (intrinsic disorder)
AlphaFold’s structural output can be conceptualized within a probabilistic framework as a posterior predictive distribution:
\[ P(\mathbf{X} \mid S, \theta) \]
Where:
\({X}\) = {\(x_1\), \(x_2\), …, \(x_n\)} are the predicted Cartesian coordinates of all atoms
\(S\) = {\(s_1\) , \(s_2\), …, \(s_n\)} is the amino acid sequence
\(\theta\) are the learned parameters from training ($W_i$)
The predicted Local Distance Difference Test (pLDDT) score is a continuous value from 0 to 100 assigned to each residue in the predicted structure. This score is designed as a learned proxy for the actual LDDT-Cα metric — a well-established structure comparison metric based on the preservation of local distances between pairs of residues..
Empirically, residues with pLDDT > 90 correspond to regions with expected RMSD errors below 1 Å when benchmarked against experimental structures. Lower pLDDT values often correspond to intrinsically disordered regions (IDRs), unresolved loops, or regions with weak evolutionary constraints. The key statistical insight is that pLDDT is not a measure of physical flexibility per se — but rather a learned error estimate of model uncertainty conditioned on the input sequence and evolutionary features.
The Predicted Aligned Error (PAE) matrix provides a pairwise error estimate between all residues i,j in the sequence, representing the expected deviation (in Å) between the predicted and true positions of residues i and j, assuming optimal alignment of the surrounding structure.
Formally, the PAE matrix approximates:
\[ PAE(i,j) \approx \mathbb{E}\left[ \|x_i^{\text{pred}} - x_i^{\text{true}}\| \; \middle| \; \text{alignment}(x_{\sim i,j}) \right] \]
Where \(x_i^{\text{pred}}\) and \(x_i^{\text{true}}\) are the predicted and true coordinates of residue \(i\), and \(\text{alignment}(x_{\sim i,j})\) denotes optimal structural alignment excluding residues \(i\) and \(j\).
Statistically, the PAE provides a localized posterior estimate of prediction error, conditioned on the alignment of other regions in the structure.
The PAE is especially informative for assessing inter-domain flexibility or uncertainty in relative domain positioning. It allows users to distinguish between confident intra-domain structure (low PAE) and uncertain inter-domain orientation (high PAE), even when local pLDDT scores are uniformly high.
| Metric | Scale | Interpretation | Statistical Analogy |
|---|---|---|---|
| pLDDT | Per-residue | Local model certainty | Inverse variance / confidence interval width |
| PAE | Residue-residue pair | Expected pairwise error under alignment | Conditional posterior mean error |
Both metrics arise from auxiliary heads trained within AlphaFold’s architecture to minimize bespoke loss functions that correlate with experimental error metrics (RMSD, TM-score). The pLDDT and PAE values are thus not true Bayesian posterior variances but learned error estimators calibrated empirically on large structure databases (PDB).