This tutorial walks through a complete workflow for processing maize genetic data, focusing on:
First, let’s install and load the necessary packages:
# Install required packages if not already installed
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!requireNamespace("Biostrings", quietly = TRUE))
BiocManager::install("Biostrings")
if (!requireNamespace("tidyverse", quietly = TRUE))
install.packages("tidyverse")
# Load packages
library(Biostrings) # For FASTA manipulation
library(tidyverse) # For data manipulation
Let’s set up our directory structure and read our input files:
# Create a project directory if it doesn't exist
project_dir <- "tree_plotting"
if (!dir.exists(project_dir)) {
dir.create(project_dir)
}
# Set file paths within the project directory
aln_file <- file.path(project_dir, "hpc1_aligned.fasta")
metadata_file <- file.path(project_dir, "seqid_label.csv")
# Read the mapping table with taxonomic metadata
metadata <- read.csv(metadata_file)
# Display the first few rows to understand the structure
head(metadata)
# Read the FASTA alignment using Biostrings
original_alignment <- readDNAStringSet(aln_file)
# View information about the alignment
cat("Number of sequences:", length(original_alignment), "\n")
cat("Sequence length:", width(original_alignment)[1], "bp\n")
# Display the first few sequence names
head(names(original_alignment))
Now we’ll prepare our sequence names and map them to meaningful labels:
# Extract sequence IDs from the alignment and remove description fields
seq_ids <- names(original_alignment)
trimmed_ids <- gsub("\\s.*", "", seq_ids, perl=TRUE)
# Verify sequence ID counts match
cat("Original sequence count:", length(original_alignment), "\n")
cat("Trimmed IDs count:", length(trimmed_ids), "\n")
# Assign the trimmed IDs to the alignment object
names(original_alignment) <- trimmed_ids
# Create a mapping dataframe that connects sequence IDs with taxonomic labels
# We also keep track of the original order in the alignment
name_swap <- data.frame(seqid = trimmed_ids) %>%
mutate(aln_order = row_number()) %>%
inner_join(metadata)
# Filter to include only sequences that exist in our alignment
name_swap <- name_swap %>%
dplyr::filter(seqid %in% names(original_alignment))
# Report on mapping success
cat("Sequences successfully mapped:", nrow(name_swap), "\n")
# Filter the alignment to keep only sequences we have metadata for
filtered_alignment <- original_alignment[name_swap$seqid]
# Now replace the technical IDs with meaningful labels from label_2 column
names(filtered_alignment) <- name_swap$label_2
# Store counts for later use
n_taxa <- length(filtered_alignment)
n_pos <- width(filtered_alignment)[1]
# Examine our newly labeled sequences
head(names(filtered_alignment))
Next, we’ll trim the alignment and identify the I211V variant position:
# Load GenomicRanges for efficient sequence manipulation
library(GenomicRanges)
# Define trimming ranges (starting from position 2226 to the end)
trimmedRanges <- GRanges(
seqnames = names(filtered_alignment),
ranges = IRanges(
start = rep(2226, n_taxa),
end = rep(n_pos, n_taxa)
)
)
# Perform the trimming operation
trimmed_alignment <- filtered_alignment[trimmedRanges]
# Verify the trimmed alignment
cat("Trimmed alignment length:", width(trimmed_alignment)[1], "bp\n")
head(names(trimmed_alignment))
# Find the position of the I211V variant using a specific sequence pattern
# The pattern "ATCACCCGC" corresponds to the I211V region in the B73 reference
I211V_B73 <- vmatchPattern("ATCACCCGC", trimmed_alignment)[[1]]
# Extract the variant position for all sequences
I211V_range <- GRanges(
seqnames = names(filtered_alignment),
ranges = IRanges(
start = rep(start(I211V_B73), n_taxa),
end = rep(start(I211V_B73), n_taxa)
)
)
# Extract just the nucleotide at the variant position from each sequence
I211V_aln <- trimmed_alignment[I211V_range]
I211V_matrix <- as.matrix(I211V_aln)
# Replace gap characters with NA for cleaner data handling
I211V_matrix[I211V_matrix == "-"] <- NA
# Add the variant information to our metadata dataframe
name_swap$I211V <- factor(as.vector(I211V_matrix))
# Check the distribution of genotypes at this position
table(name_swap$I211V, useNA = "ifany")
Let’s save our relabeled and trimmed alignment:
# Define output file path
output_alignment <- file.path(project_dir, "hpc1_nice_labels.fasta")
# Write the renamed alignment to a new file
writeXStringSet(trimmed_alignment, output_alignment, format="fasta")
cat("Renamed alignment written to:", output_alignment, "\n")
Now we’ll build and visualize a phylogenetic tree from our processed alignment:
# Load packages for phylogenetic analysis
library(ape)
library(phangorn)
library(phytools)
# Read the renamed alignment for phylogenetic analysis
hpc1_aln <- read.dna(output_alignment, format="fasta")
# Check labels in the alignment
labels(hpc1_aln)
# Convert to phangorn's phyDat format for phylogenetic analysis
hpc1_phyDat <- phyDat(hpc1_aln, type = "DNA", levels = NULL)
# Calculate distance matrix using JC69 model
# Note: You could use modelTest() to find the best model for your data
# mt <- modelTest(hpc1_phyDat)
dna_dist <- dist.ml(hpc1_phyDat, model="JC69")
# Ensure founder ancestry is properly formatted as a factor
# Note how we specify the order of levels to control the color scheme
name_swap$founder_ancestry <- factor(name_swap$founder_ancestry, levels=c("Recurrent", "Donor"))
# Build UPGMA tree and ladderize it for better visualization
hpc1_UPGMA <- ladderize(upgma(dna_dist), right = FALSE)
# Simple plot to check the tree
plot(hpc1_UPGMA, cex = 0.7, main = "UPGMA Tree Before Rotation")
To position B73 at the top of our visualization, we need a specialized rotation function:
# Function to rotate a tree to position a specific tip at the top
pivot_on <- function(x, target_tip){
target_tip_label <- target_tip
tree <- x
# Make a copy to modify, keeping the original intact
rotated_tree <- tree
# Check if the target tip exists in the tree
if (!target_tip_label %in% rotated_tree$tip.label) {
stop("Target tip '", target_tip_label, "' not found in the tree.")
}
# Get the total number of tips
ntips <- Ntip(rotated_tree)
# Get the node number corresponding to the target tip
target_tip_node <- which(rotated_tree$tip.label == target_tip_label)
# Use a reference node to establish a path
# Here we're using the last tip as our reference
reference_node <- ntips
# Find the path from the reference node to our target tip
node_path <- nodepath(rotated_tree, from = reference_node, to = target_tip_node)
# We only need to consider rotating internal nodes on the path
# Internal nodes have numbers greater than ntips
internal_nodes_on_path <- node_path[node_path > ntips]
# Iterate through each node and rotate if necessary
if (length(internal_nodes_on_path) > 0) {
# Process nodes in order from root towards tip
nodes_to_check <- sort(internal_nodes_on_path)
for (node_to_rotate in nodes_to_check) {
# Find the direct children of this internal node
children <- rotated_tree$edge[rotated_tree$edge[, 1] == node_to_rotate, 2]
# Check if we have exactly two children (bifurcating tree assumption)
if (length(children) != 2) {
warning("Node ", node_to_rotate, " is not bifurcating. Rotation might not work as expected.")
next # Skip rotation for non-bifurcating nodes
}
# Find which child is on the path towards the target tip
current_node_index_in_path <- which(node_path == node_to_rotate)
child_on_path <- node_path[current_node_index_in_path + 1]
# If the child on the path is the first child, rotate the node
# This will make our target tip appear higher in the visualization
if (child_on_path == children[1]) {
rotated_tree <- rotate(rotated_tree, node = node_to_rotate)
}
}
}
# Return the rotated tree
return(rotated_tree)
}
Now let’s apply our rotation function and create the final plot:
# Ensure row names in our metadata match the tree's tip labels
rownames(name_swap) <- name_swap$label_2
# Rotate the tree to put the first tip (B73 reference) at the top
out_tree <- pivot_on(hpc1_UPGMA, hpc1_UPGMA$tip.label[1])
# Create a function to generate our visualization
plot_output <- function(file, tree, data){
# Rename founder_ancestry column to ancestry for clarity
data <- data %>% dplyr::rename(ancestry = "founder_ancestry")
# Define color palette for our categories
pal <- c("gold", "purple3", "tomato", "royalblue")
names(pal) <- c("Recurrent", "Donor", "A", "G")
# Extract just the columns we want to visualize
X <- data[, c("ancestry", "I211V")]
rownames(X) <- rownames(data)
# Create the phylogenetic tree with colored dots showing metadata
pdf(file = file, height = 13, width = 7)
dotTree(tree, X, colors = pal,
labels = TRUE, # Show taxon labels
fsize = 0.7, # Font size
border = "transparent", # No border around dots
cex.dot = 2) # Size of dots
dev.off()
}
# Generate the visualization
output_plot <- file.path(project_dir, "hpc1_UPGMA_dot_tree.pdf")
plot_output(output_plot, out_tree, name_swap)
cat("Plot saved to:", output_plot, "\n")
# Save the tree in Newick format for future use
output_tree <- file.path(project_dir, "hpc1_UPGMA.tre")
write.tree(out_tree, file = output_tree)
cat("Tree saved to:", output_tree, "\n")
# Display the tree in the current R session (if running interactively)
plot(out_tree, main = "Rotated Tree with B73 at Top", cex = 0.7)
Let’s examine the complete file structure of our project:
## tree_plotting/
## ├── hpc1_aligned.fasta # Input: Original alignment file
## ├── seqid_label.csv # Input: Metadata mapping technical IDs to meaningful names
## ├── hpc1_nice_labels.fasta # Output: Alignment with renamed sequences
## ├── hpc1_UPGMA.tre # Output: Phylogenetic tree in Newick format
## └── hpc1_UPGMA_dot_tree.pdf # Output: Visualization with ancestry and variant information
This tutorial demonstrated how to:
These skills are widely applicable to comparative genomics and population genetics studies in maize and other organisms.
vmatchPattern
function fails, verify that the pattern
sequence exists in your alignment. You may need to adjust the pattern or
the starting position.