#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("DESeq2")gene_expression_level_analysis_Abru
Analyzing Argiope_bruennichi RNA-seq data with DESeq2, complete matrix from Kallisto
This quarto doc include the detailed info to perform gene-level differential expression analysis using DESeq2 on Argiope brunnechi RNA-seq data. The focus is on identifying chemosensory genes (IRs, GRs, GPCRs) differentially expressed across tissues (legs, pedipalps, mouthparts) and sexes. The analysis includes normalization, dispersion estimation, and visualization of key contrasts.
Installation and required libraries
#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("rhdf5")#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("ComplexHeatmap")library(GenomicFeatures)Warning: package 'GenomicFeatures' was built under R version 4.3.3
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Warning: package 'S4Vectors' was built under R version 4.3.2
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
expand.grid, I, unname
Loading required package: IRanges
Loading required package: GenomeInfoDb
Warning: package 'GenomeInfoDb' was built under R version 4.3.3
Loading required package: GenomicRanges
Loading required package: AnnotationDbi
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ggplot2)Warning: package 'ggplot2' was built under R version 4.3.3
library(reshape)Warning: package 'reshape' was built under R version 4.3.3
Attaching package: 'reshape'
The following objects are masked from 'package:S4Vectors':
expand, rename
library(dplyr)
Attaching package: 'dplyr'
The following object is masked from 'package:reshape':
rename
The following object is masked from 'package:AnnotationDbi':
select
The following object is masked from 'package:Biobase':
combine
The following objects are masked from 'package:GenomicRanges':
intersect, setdiff, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(tidyverse)Warning: package 'tibble' was built under R version 4.3.3
Warning: package 'purrr' was built under R version 4.3.3
Warning: package 'lubridate' was built under R version 4.3.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.4 ✔ tibble 3.3.0
✔ purrr 1.0.4 ✔ tidyr 1.3.1
✔ readr 2.1.5
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::%within%() masks IRanges::%within%()
✖ dplyr::collapse() masks IRanges::collapse()
✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
✖ dplyr::desc() masks IRanges::desc()
✖ tidyr::expand() masks reshape::expand(), S4Vectors::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::first() masks S4Vectors::first()
✖ dplyr::lag() masks stats::lag()
✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename() masks reshape::rename(), S4Vectors::rename()
✖ lubridate::second() masks S4Vectors::second()
✖ lubridate::second<-() masks S4Vectors::second<-()
✖ dplyr::select() masks AnnotationDbi::select()
✖ dplyr::slice() masks IRanges::slice()
✖ lubridate::stamp() masks reshape::stamp()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DESeq2)Warning: package 'DESeq2' was built under R version 4.3.3
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats
Warning: package 'matrixStats' was built under R version 4.3.3
Attaching package: 'matrixStats'
The following object is masked from 'package:dplyr':
count
The following objects are masked from 'package:Biobase':
anyMissing, rowMedians
Attaching package: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
colWeightedMeans, colWeightedMedians, colWeightedSds,
colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
rowWeightedSds, rowWeightedVars
The following object is masked from 'package:Biobase':
rowMedians
library(tximport)
library(tximportData)
library(apeglm)
library("pheatmap")Warning: package 'pheatmap' was built under R version 4.3.3
library(rhdf5)Warning: package 'rhdf5' was built under R version 4.3.2
library(RColorBrewer)
library(ComplexHeatmap)Loading required package: grid
========================================
ComplexHeatmap version 2.18.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
If you use it in published research, please cite either one:
- Gu, Z. Complex Heatmap Visualization. iMeta 2022.
- Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
genomic data. Bioinformatics 2016.
The new InteractiveComplexHeatmap package can directly export static
complex heatmaps into an interactive Shiny app with zero effort. Have a try!
This message can be suppressed by:
suppressPackageStartupMessages(library(ComplexHeatmap))
========================================
! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
in the original pheatmap() are identically supported in the new function. You
can still use the original function by explicitly calling pheatmap::pheatmap().
Attaching package: 'ComplexHeatmap'
The following object is masked from 'package:pheatmap':
pheatmap
library(circlize)Warning: package 'circlize' was built under R version 4.3.3
========================================
circlize version 0.4.16
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: https://jokergoo.github.io/circlize_book/book/
If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
in R. Bioinformatics 2014.
This message can be suppressed by:
suppressPackageStartupMessages(library(circlize))
========================================
library(cluster)Warning: package 'cluster' was built under R version 4.3.3
library(factoextra)Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
PCA Before Running DESeq2: can I Collapse technical replicates?
It’s a good idea to perform an exploratory analysis like Principal Component Analysis (PCA). This helps us understand the structure of our data — and, in this case, assess whether we can collapse technical replicates.
If your technical replicates (e.g., repeated sequencing of the same library) cluster very tightly together, that’s a good sign. It means they are consistent, and you may choose to collapse them into a single sample to simplify downstream analysis.
Collapsing technical replicates means summing or averaging the expression values (ideally raw counts) from multiple runs of the same sample. DESeq2 even has a built-in function for this:
collapseReplicates()
Important: Only use this function for technical replicates, not biological replicates (i.e., samples from different individuals).
What kind of data should I use for PCA?
While DESeq2 requires raw (unnormalized) counts, PCA is more flexible.
You can use:
- TPMs (Transcripts Per Million) – good for visualizing expression patterns and clustering.
- log2(TPM + 1) – common transformation to stabilize variance and reduce skewness.
In our case, we used TPMs to generate PCA plots and assess whether technical replicates group together.
Summary of steps
- Prepare a TPM matrix: samples as columns, genes as rows.
- Log-transform:
log2(TPM + 1)for visualization. - Transpose: PCA expects samples as rows.
- Run PCA: Use
prcomp()in R. - Visualize: Plot PC1 vs PC2, colored by sample condition, tissue, or replicate group.
- Interpret: If technical replicates cluster very closely → you may collapse them using
collapseReplicates()(if working with estimated counts).
PCA
run_pca_analysis <- function(counts_file,
sample_info_file,
sep_counts = ";",
sep_meta = ";",
plot_title = "PCA Plot",
x_limits = NULL,
y_limits = NULL) {
library(tidyverse)
library(ggrepel)
# Load data
counts <- read.csv(counts_file, sep = sep_counts, header = TRUE)
sample_info <- read.csv(sample_info_file, sep = sep_meta, header = TRUE)
# Standardize sample names
colnames(counts) <- tolower(colnames(counts))
sample_info$sample <- tolower(trimws(sample_info$sample))
# Remove non-sample columns
counts_clean <- counts[, !colnames(counts) %in% c("target_id", "x")]
# Check matching samples
missing <- setdiff(sample_info$sample, colnames(counts_clean))
if (length(missing) > 0) {
stop(paste("⚠️ These samples are in sample_info but not in counts:", paste(missing, collapse = ", ")))
}
# Reorder
counts_matrix <- counts_clean[, sample_info$sample]
# Log-transform and filter
log_counts <- log2(counts_matrix + 1)
log_counts_t <- t(log_counts)
variances <- apply(log_counts_t, 2, var, na.rm = TRUE)
log_counts_t <- log_counts_t[, variances != 0]
log_counts_t <- log_counts_t[, colSums(is.na(log_counts_t)) == 0]
# PCA
pca <- prcomp(log_counts_t)
percentVar <- pca$sdev^2 / sum(pca$sdev^2) * 100
pc_scores <- as_tibble(pca$x, rownames = "sample") %>%
left_join(sample_info, by = "sample")
# Plot
plot <- ggplot(pc_scores, aes(x = PC1, y = PC2, color = condition)) +
geom_point(size = 3.0, alpha = 0.9, stroke = 0.2) +
scale_color_brewer(palette = "Paired") +
stat_ellipse(type = "t", level = 0.95, linetype = "dashed", size = 1, alpha = 0.4) +
labs(
title = plot_title,
x = paste0("PC1 (", round(percentVar[1], 1), "%)"),
y = paste0("PC2 (", round(percentVar[2], 1), "%)"),
color = "Condition"
) +
theme_classic(base_size = 16) +
theme(
plot.title = element_text(face = "bold", size = 18),
axis.text = element_text(color = "black"),
legend.position = "right",
panel.grid = element_blank(),
axis.line = element_line(color = "black"),
legend.title = element_text(face = "bold")
)
# Apply fixed limits if specified
if (!is.null(x_limits)) plot <- plot + xlim(x_limits)
if (!is.null(y_limits)) plot <- plot + ylim(y_limits)
return(plot)
}
#https://r-graph-gallery.com/38-rcolorbrewers-palettes.htmlMouthparts figure
run_pca_analysis(
counts_file = "tpm_abundance_matrix_mouthparts.csv",
sample_info_file = "sample_mouthparts.csv",
sep_counts = ";",
sep_meta = ";",
plot_title = "Mouthparts"
)Warning: package 'ggrepel' was built under R version 4.3.3
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Pedipalps figure
run_pca_analysis(
counts_file = "tpm_abundance_matrix_pedipalps.csv",
sample_info_file = "sample_pedipalps.csv",
sep_counts = ";",
sep_meta = ";",
plot_title = "Pedipalps"
)Legs figure
run_pca_analysis(
counts_file = "tpm_counts_matrix_legs_sorted.tsv",
sample_info_file = "sample.csv",
sep_counts = ",",
sep_meta = ",",
plot_title = "Legs"
)Merging figures
# Define fixed axis limits for all plots
x_lim <- c(-100, 100)
y_lim <- c(-75, 75)
# Redefine the PCA plots with coord_fixed()
pca_mouthparts <- run_pca_analysis(
counts_file = "tpm_abundance_matrix_mouthparts.csv",
sample_info_file = "sample_mouthparts.csv",
sep_counts = ";",
sep_meta = ";",
plot_title = "Mouthparts PCA",
x_limits = x_lim,
y_limits = y_lim
) + coord_fixed()
pca_pedipalps <- run_pca_analysis(
counts_file = "tpm_abundance_matrix_pedipalps.csv",
sample_info_file = "sample_pedipalps.csv",
sep_counts = ";",
sep_meta = ";",
plot_title = "Pedipalps PCA",
x_limits = x_lim,
y_limits = y_lim
) + coord_fixed()
pca_legs <- run_pca_analysis(
counts_file = "tpm_counts_matrix_legs_sorted.tsv",
sample_info_file = "sample.csv",
sep_counts = ",",
sep_meta = ",",
plot_title = "Legs PCA",
x_limits = x_lim,
y_limits = y_lim
) + coord_fixed()# Add an empty plot to balance the grid (optional)
#blank_plot <- ggplot() + theme_void()
# Combine into a 2x2 layout
#final_grid <- (pca_mouthparts | pca_pedipalps) / (pca_legs | blank_plot)
# Show the final figure
#final_grid
# Save in high quality
#ggsave("PCA_all_quadrants_ellipse_05082025.tiff", plot = final_grid, width = 10, height = 10, dpi = 600)
#ggsave("PCA_all_quadrants_ellipse_05082025.pdf", plot = final_grid, width = 10, height = 10)To explore the major sources of variation in gene expression across spider appendages, we performed PCA on TPM-normalized expression data from mouthparts, pedipalps, and legs.
In mouthparts, samples clustered clearly by sex along PC1, which explained 26.6% of the total variance (Figure X, top left). This strong separation suggests that sexual dimorphism is a major driver of transcriptomic divergence in these tissues.
In pedipalps, the separation by sex was even more pronounced, with PC1 accounting for 58.3% of the variance (Figure X, top right). Male and female samples formed two distinct clusters with minimal overlap, indicating highly divergent expression profiles.
In contrast, leg samples clustered into four distinct groups corresponding to sex (male/female) and tissue subregion (proximal/distal) (Figure X, bottom). PC1 and PC2 together explained 24.7% of the variance. These results highlight both sexual and spatial transcriptional specialization within the legs.
Overall, PCA demonstrates strong transcriptomic differentiation by sex in pedipalps and mouthparts, while leg transcriptomes exhibit additional spatial complexity related to anatomical subregions.
PCA, Scree plot and biplot
Principal Component Analysis (PCA) is an essential exploratory data analysis technique in genomics. Its primary goal is to simplify a complex dataset by reducing the number of variables while preserving as much of the original data’s variation as possible. In gene expression analysis, we use PCA to understand the overall structure of our data and identify the major sources of variation, such as differences between experimental conditions, tissues, or technical batches.
- The PCA Plot (PC1 vs. PC2) The most common way to visualize PCA results is a scatter plot showing the first two principal components (PC1 and PC2).
What it shows: Each point on the plot represents a single sample. Samples that are close to each other have similar gene expression patterns.
What it tells you: A PCA plot immediately reveals if your samples group together based on your known experimental conditions (e.g., males vs. females, different tissues). Clear clustering suggests that your experimental conditions are the main drivers of gene expression differences. The percentage of variance explained by each PC, shown on the axis labels, tells you how much of the total data variability is captured by that component.
- The Scree Plot The Scree Plot is a visualization that helps you decide how many principal components are worth keeping for your analysis.
What it shows: This plot displays the percentage of the total variance explained by each principal component, ordered from greatest to least.
What it tells you: You look for an “elbow” or a point where the curve flattens out. The components to the left of the elbow are considered the most significant because they explain a disproportionately large amount of the variance. Components after the elbow explain very little of the remaining variation and are often discarded. This is a crucial step for determining if your PC1 and PC2 plots are truly representative of the major trends in your data.
- The Biplot A biplot is a more advanced visualization that combines two plots into one: the sample score plot and the variable loading plot.
What it shows: It displays the individual samples as points and the original variables (your genes) as arrows.
What it tells you:
Gene contribution: The length of an arrow indicates how much a gene contributes to the PCA. Longer arrows represent genes that are more influential in explaining the data’s variation.
Gene expression trends: The direction of an arrow shows its correlation with the principal components. For example, if a gene’s arrow points towards a specific cluster of samples, it suggests that this gene is highly expressed in those samples. This helps you identify the specific genes responsible for the observed clustering patterns.
run_pca_analysis <- function(counts_file,
sample_info_file,
sep_counts = ";",
sep_meta = ";",
plot_title = "PCA Plot") {
library(tidyverse)
library(ggrepel)
library(ggfortify)
# Load data
counts <- read.csv(counts_file, sep = sep_counts, header = TRUE)
sample_info <- read.csv(sample_info_file, sep = sep_meta, header = TRUE)
# Standardize sample names
colnames(counts) <- tolower(colnames(counts))
sample_info$sample <- tolower(trimws(sample_info$sample))
# Remove non-sample columns and reorder
counts_clean <- counts[, !colnames(counts) %in% c("target_id", "x")]
missing <- setdiff(sample_info$sample, colnames(counts_clean))
if (length(missing) > 0) {
stop(paste("⚠️ These samples are in sample_info but not in counts:", paste(missing, collapse = ", ")))
}
counts_matrix <- counts_clean[, sample_info$sample]
# Log-transform and filter
log_counts <- log2(counts_matrix + 1)
log_counts_t <- t(log_counts)
variances <- apply(log_counts_t, 2, var, na.rm = TRUE)
log_counts_t <- log_counts_t[, variances != 0]
log_counts_t <- log_counts_t[, colSums(is.na(log_counts_t)) == 0]
# Perform PCA
pca <- prcomp(log_counts_t)
percentVar <- pca$sdev^2 / sum(pca$sdev^2) * 100
pc_scores <- as_tibble(pca$x, rownames = "sample") %>%
left_join(sample_info, by = "sample")
# --- Generate Additional Plots and Tables ---
# Main PCA Plot with `Paired` color palette
pca_plot <- ggplot(pc_scores, aes(x = PC1, y = PC2, color = condition)) +
geom_point(size = 3.0, alpha = 0.9, stroke = 0.2) +
scale_color_brewer(palette = "Paired") +
stat_ellipse(type = "t", level = 0.95, linetype = "dashed", size = 1, alpha = 0.4) +
labs(
title = plot_title,
x = paste0("PC1 (", round(percentVar[1], 1), "%)"),
y = paste0("PC2 (", round(percentVar[2], 1), "%)"),
color = "Condition"
) +
theme_classic(base_size = 16)
# Scree Plot in grayscale
scree_data <- data.frame(
PC = 1:length(percentVar),
Variance_Explained = percentVar
)
scree_plot <- ggplot(scree_data, aes(x = PC, y = Variance_Explained)) +
geom_bar(stat = "identity", fill = "grey50") +
geom_point() +
geom_line(group = 1) +
scale_x_continuous(breaks = scales::breaks_pretty()) + # To have clean X-axis ticks
labs(
title = paste0("Scree Plot for ", plot_title),
x = "Principal Component",
y = "Percentage of Variance Explained"
) +
theme_classic(base_size = 16)
# Top Loadings (Top 10 for each PC)
loadings <- as.data.frame(pca$rotation)
top_pc1_loadings <- head(loadings[order(abs(loadings$PC1), decreasing = TRUE), ], 10)
top_pc2_loadings <- head(loadings[order(abs(loadings$PC2), decreasing = TRUE), ], 10)
# Biplot with `Paired` color palette
biplot <- autoplot(
pca,
data = sample_info,
colour = 'condition',
loadings = TRUE,
loadings.colour = 'black',
loadings.label = TRUE,
loadings.label.size = 3,
loadings.label.repel = TRUE
) +
scale_color_brewer(palette = "Paired") +
labs(
title = paste0("Biplot for ", plot_title),
x = paste0("PC1 (", round(percentVar[1], 1), "%)"),
y = paste0("PC2 (", round(percentVar[2], 1), "%)"),
color = "Condition"
) +
theme_classic(base_size = 16)
# Return all results in a list
return(list(
pca_plot = pca_plot,
scree_plot = scree_plot,
top_pc1_loadings = top_pc1_loadings,
top_pc2_loadings = top_pc2_loadings,
biplot = biplot
))
}Example mouthparts data
results_mouthparts <- run_pca_analysis(
counts_file = "tpm_abundance_matrix_mouthparts.csv",
sample_info_file = "sample_mouthparts.csv",
sep_counts = ";",
sep_meta = ";",
plot_title = "Mouthparts"
)Warning: package 'ggfortify' was built under R version 4.3.3
# Puedes acceder a los resultados así:
results_mouthparts$pca_plot # El gráfico principalresults_mouthparts$scree_plot # El Scree Plotresults_mouthparts$top_pc1_loadings # La tabla con los top loadings de PC1 PC1 PC2 PC3 PC4 PC5
9086 -0.09814108 -0.02412161 -0.012565060 0.0028764919 -0.0094581724
1528 -0.07621531 0.02079520 -0.014851423 0.0235238105 0.0003928857
7779 -0.06914623 -0.05177254 -0.006536295 -0.0135447517 -0.0002751259
7813 0.06895159 -0.02347019 0.005806390 0.0402700903 0.0052274167
7780 -0.06888464 -0.04665605 0.018012171 -0.0218540869 -0.0254305567
3554 -0.06691567 0.02175299 0.003519418 0.0009628532 -0.0195259395
7095 -0.06611325 -0.06895728 -0.007332291 -0.0027559091 0.0071772334
5450 0.05838478 -0.02037634 0.012338678 0.0321409119 0.0046823302
8327 0.05837528 -0.02893137 0.024864098 0.0475000262 0.0136266098
9481 0.05787969 -0.02316307 0.014258372 0.0471407170 0.0033240234
PC6 PC7 PC8 PC9 PC10
9086 -0.040300111 0.035256008 0.047901930 0.0233597866 -0.003950965
1528 0.011429196 -0.007662001 0.002784156 0.0005088135 0.021956234
7779 -0.079245849 0.001257662 0.029718091 0.0357395669 0.010327853
7813 -0.008803827 -0.009064431 0.034256836 -0.0161075305 -0.001162354
7780 -0.043197861 -0.002481396 0.030859971 0.0145438965 -0.017090989
3554 -0.002411622 0.003136325 0.016973516 -0.0097741050 0.023555370
7095 -0.064644691 -0.002440588 0.024135120 0.0270978902 0.002691567
5450 -0.012471319 0.005685321 0.014495695 -0.0139085261 -0.027643689
8327 -0.005898299 -0.007003989 0.008772931 -0.0004597064 -0.013814271
9481 -0.033182634 0.011330262 0.024520691 -0.0145302931 -0.019367793
PC11 PC12 PC13 PC14 PC15
9086 -0.034443981 -0.0026369855 -0.0005117883 0.0094159608 -1.326521e-03
1528 0.009656387 0.0011332333 -0.0033551362 0.0034508976 2.599936e-03
7779 -0.013918917 0.0007423818 0.0060811952 0.0009264069 -5.722307e-04
7813 -0.035621598 -0.0162263926 0.0119224985 0.0062967724 -5.418285e-03
7780 -0.010745666 -0.0086193228 0.0079970306 -0.0089937653 8.533943e-03
3554 0.013772653 0.0013501372 0.0027082859 0.0004624584 -1.246738e-03
7095 -0.020663991 0.0034580519 0.0007120285 0.0027990672 2.929176e-05
5450 0.028261363 -0.0014472907 0.0092665678 0.0027255298 -1.032781e-03
8327 -0.016764610 -0.0060168349 0.0078453630 0.0044147461 -6.353317e-03
9481 -0.002814759 -0.0052718683 0.0015085768 -0.0028287688 -4.201346e-03
PC16 PC17 PC18 PC19 PC20
9086 -0.0056544275 -1.666988e-03 0.012123333 0.004416369 0.0072860778
1528 0.0016493451 3.058687e-03 -0.013252212 -0.003519015 0.0144216609
7779 -0.0010165157 -3.283025e-05 -0.001375222 0.003434028 0.0004319435
7813 0.0322958442 2.387423e-02 0.010366780 0.005585950 -0.0153911274
7780 0.0010439520 -3.289283e-03 -0.019543142 -0.015116095 -0.0134885425
3554 0.0038826002 4.377421e-03 0.017104746 0.006157088 0.0092918513
7095 -0.0006853297 -4.556215e-03 0.007465244 0.007399496 -0.0033117754
5450 -0.0044489336 -1.127360e-03 0.005398567 0.002034091 -0.0085049858
8327 0.0104803986 2.118874e-03 -0.009532769 -0.009175550 0.0025310283
9481 -0.0033547250 -5.248842e-03 -0.006343997 0.000120549 -0.0021570927
PC21 PC22 PC23 PC24 PC25
9086 -0.0039233437 -1.157324e-03 -1.332625e-03 0.005895577 0.0039348254
1528 -0.0086390917 7.944095e-04 3.708293e-03 -0.002123568 0.0010545277
7779 0.0032878786 -3.108047e-04 3.648918e-05 0.002253197 -0.0002988526
7813 -0.0016173251 -1.876392e-02 -5.564359e-03 -0.012740045 -0.0313828936
7780 0.0021828712 1.118097e-03 2.887849e-04 -0.002335475 -0.0166117735
3554 -0.0063193619 1.219561e-02 8.344215e-03 0.003964360 0.0152334280
7095 0.0080658335 5.529394e-05 -3.995793e-03 0.003748645 -0.0007414758
5450 -0.0031617301 1.670373e-04 -1.356515e-02 -0.010023415 -0.0040859235
8327 -0.0134100258 4.796658e-03 1.091530e-03 0.018668141 -0.0081407351
9481 0.0008102957 -5.174720e-03 -2.013448e-02 -0.006933807 0.0051624089
PC26 PC27 PC28 PC29 PC30
9086 -0.0010342006 -0.005676462 -0.0081288119 -0.0006056192 0.0029948472
1528 -0.0127871850 -0.010039055 0.0068463931 -0.0025518403 0.0027823036
7779 0.0000703099 0.001119928 -0.0044728579 -0.0060925421 0.0017055689
7813 -0.0002575864 0.009605432 -0.0130947262 -0.0182411312 -0.0091696918
7780 0.0005729001 -0.006481093 -0.0027266266 0.0047457935 0.0015042507
3554 -0.0041661212 0.013439956 -0.0062516036 -0.0110201560 0.0105677652
7095 0.0037264039 -0.001048150 -0.0047465548 -0.0019118877 0.0014572347
5450 0.0108719163 -0.003510948 0.0003320994 -0.0030719611 -0.0003986891
8327 0.0142177151 0.021276798 0.0227838647 0.0098026332 -0.0062187702
9481 -0.0026948439 0.017788875 -0.0108771598 0.0040259733 -0.0021122729
PC31 PC32 PC33 PC34 PC35
9086 -0.002902100 -0.0077979702 0.0110113431 -1.665597e-03 -0.0011310420
1528 -0.004426732 0.0006838857 0.0015484745 -2.217286e-03 -0.0198485658
7779 -0.000217143 -0.0032302053 -0.0020962359 -6.351320e-03 0.0029332507
7813 0.006678942 0.0123182377 0.0013875959 3.781012e-05 -0.0105643241
7780 -0.001715420 -0.0049079503 -0.0012986019 -1.115503e-02 -0.0110038535
3554 -0.011399112 -0.0119617099 -0.0004587524 3.707862e-03 -0.0007693452
7095 -0.001122482 -0.0047039421 -0.0007402167 3.004599e-04 -0.0004870754
5450 -0.006529631 -0.0020201219 -0.0072828550 -8.557770e-03 0.0050723070
8327 0.009883490 -0.0003508055 -0.0173247108 -3.327030e-03 0.0032376241
9481 0.010920176 0.0021595768 -0.0105166530 1.069410e-02 -0.0038972899
PC36 PC37 PC38 PC39 PC40
9086 -0.0002090173 -0.007632893 -0.002707434 -0.008954269 -0.0060926698
1528 -0.0053194904 -0.003143221 0.018035213 -0.005633576 -0.0010681355
7779 0.0063277202 0.004082642 -0.002408464 -0.002607110 0.0007083805
7813 0.0046936533 0.012202825 0.014514310 0.012792788 -0.0171703775
7780 -0.0050110117 -0.002887375 -0.008044296 0.002684067 -0.0044182069
3554 0.0069804313 -0.013771717 -0.012525025 -0.015018996 -0.0158866346
7095 0.0026273828 0.002390753 -0.001896198 -0.001064397 0.0006814547
5450 -0.0037910545 0.003908743 -0.010941898 -0.042300229 0.0079026606
8327 -0.0068252874 -0.016992631 0.004517394 -0.015968548 0.0039247947
9481 -0.0169526805 0.001642991 -0.004764823 0.004302928 0.0115117685
PC41 PC42 PC43 PC44 PC45
9086 -0.0018495040 0.015527822 4.797238e-03 0.0114501236 -0.0207356010
1528 0.0072991397 0.006675026 -9.851274e-03 0.0040419949 -0.0078672068
7779 0.0016318730 0.004008953 -1.104774e-02 0.0030859612 -0.0045223855
7813 -0.0137979911 -0.017567959 5.441968e-03 -0.0155035105 -0.0071441588
7780 0.0011681947 0.005785420 -1.241364e-03 0.0011040493 -0.0060256460
3554 -0.0144943191 -0.020036035 1.212507e-03 0.0131622303 0.0009216936
7095 0.0001182337 0.004721824 -2.626187e-03 -0.0005980229 -0.0021739670
5450 0.0398560115 -0.023161044 -8.128939e-06 -0.0007100080 0.0051909845
8327 -0.0039576010 0.006121151 1.378065e-02 -0.0082781421 -0.0071646328
9481 -0.0125562040 0.012992688 -9.589328e-04 0.0037036378 -0.0005291834
PC46 PC47 PC48
9086 -0.0064357837 -0.037202921 -0.0190652253
1528 -0.0102201707 0.011128479 0.0064242780
7779 0.0170004606 -0.002429331 -0.0112299908
7813 -0.0003464443 -0.005447520 0.0017015316
7780 -0.0087278175 0.010713928 -0.0400070422
3554 -0.0061023879 0.010041311 0.0007205116
7095 0.0088295321 -0.001483407 0.0047093062
5450 -0.0078475564 -0.004791943 -0.0076055347
8327 0.0013879440 0.014149942 -0.0056576797
9481 0.0011203262 0.002308886 -0.0011255149
results_mouthparts$top_pc2_loadings # La tabla con los top loadings de PC2 PC1 PC2 PC3 PC4 PC5
11816 -0.03745121 -0.09877772 0.008371141 0.01082220 0.014493430
7781 -0.04225556 -0.08590844 -0.003665320 -0.01642510 0.007641583
11789 -0.03504379 -0.08331504 0.025798325 -0.05146851 0.061079709
7085 -0.05232769 -0.07858690 0.044353794 -0.04118629 0.019091991
11815 -0.03186397 -0.07622204 0.055337636 -0.05836843 -0.002513940
2466 0.01308010 -0.07545041 0.008651408 -0.03903709 0.026701152
11788 -0.03107344 -0.07375111 0.024396556 -0.04491248 0.047000357
11819 -0.01989864 -0.07355120 0.010704016 -0.04349145 0.037017218
2524 -0.02391104 -0.07172221 0.034821477 -0.02253146 0.004168430
7079 -0.04498049 -0.07171004 0.036934940 -0.03393546 0.027606575
PC6 PC7 PC8 PC9 PC10
11816 -0.071029887 -0.0006106347 0.016035750 0.0067170446 0.006577404
7781 -0.010745037 0.0161923608 -0.004530384 -0.0006852968 -0.014805272
11789 -0.046148250 -0.0219949034 -0.058674315 -0.0303495899 0.006298675
7085 -0.028720938 -0.0284051187 0.028470952 -0.0187917756 -0.004574314
11815 -0.022284589 -0.0118625865 -0.064211376 -0.0105527581 -0.013344875
2466 -0.050112450 0.0085335153 0.013555995 -0.0573130701 -0.042059132
11788 -0.005876307 0.0038290043 -0.025481470 -0.0246745714 0.011983410
11819 -0.053638922 -0.0124920144 -0.027030098 0.0047878389 0.018143158
2524 -0.058626776 -0.0304704240 -0.014939354 0.0012958157 -0.033364888
7079 -0.031658237 -0.0430623833 -0.014369359 -0.0312172441 -0.018894123
PC11 PC12 PC13 PC14 PC15
11816 -0.008621010 -0.0001581520 0.005248951 4.380715e-03 -0.0015705266
7781 -0.020148814 0.0148220365 0.010058177 4.738621e-03 -0.0041781046
11789 0.001196176 0.0050297891 0.012860746 -1.363048e-03 -0.0040249533
7085 -0.013069979 -0.0030062368 0.002727961 8.278520e-03 -0.0017941830
11815 -0.016083659 0.0146902042 -0.023437320 5.075872e-03 0.0087413429
2466 0.053065772 -0.0031056733 0.004539871 4.664511e-05 -0.0071931782
11788 0.039014842 0.0047629217 -0.011520344 -2.304179e-04 0.0005431770
11819 -0.009646373 0.0075369755 0.003361084 -2.848867e-03 -0.0017086905
2524 -0.031527842 0.0009131745 0.008408710 5.147914e-03 -0.0013310329
7079 -0.021548853 -0.0011811862 -0.004734314 1.116744e-02 -0.0004518578
PC16 PC17 PC18 PC19 PC20
11816 -0.0041305952 0.0018237371 0.0079077600 -0.0077621382 0.0093224325
7781 -0.0009700069 0.0038807437 0.0011217361 -0.0102119942 0.0077193584
11789 0.0048090764 0.0030884740 0.0431959840 0.0096039451 0.0158289619
7085 0.0014870337 0.0033646182 0.0100372414 -0.0366303822 0.0236218529
11815 0.0145859415 0.0015632559 -0.0004016836 0.0057841552 0.0349887891
2466 0.0027043839 0.0079929859 0.0085393870 -0.0074036522 -0.0028171164
11788 0.0064425223 -0.0008111469 -0.0119176764 0.0002637689 0.0066876880
11819 0.0002973534 0.0019330162 -0.0009786640 0.0083184021 0.0005322632
2524 -0.0038271801 -0.0012469877 0.0245451491 -0.0256470770 0.0105541826
7079 0.0031197029 -0.0069378393 -0.0068049383 -0.0197159517 -0.0111067911
PC21 PC22 PC23 PC24 PC25
11816 -1.835038e-03 -0.0023157171 0.0008897277 0.0072063374 -0.007304312
7781 -3.086455e-03 -0.0060925398 0.0033033760 0.0033253108 -0.013823559
11789 -5.292323e-03 0.0024318115 0.0018825734 -0.0261209047 0.018604577
7085 -8.537280e-03 -0.0069595450 -0.0025842827 -0.0048462867 -0.010808363
11815 -3.281865e-02 -0.0273261526 0.0313273603 0.0285721266 0.010683680
2466 -2.586078e-03 0.0130601246 -0.0030911603 -0.0114403024 0.001303321
11788 1.559703e-03 -0.0057739951 0.0062642131 0.0027083936 -0.002576175
11819 -4.938050e-03 -0.0009723118 -0.0029787594 -0.0066995737 -0.005531365
2524 -8.823700e-03 -0.0004912964 0.0034313766 0.0007338782 -0.002332085
7079 -2.484315e-06 0.0039735774 0.0002871703 -0.0043363446 -0.012857107
PC26 PC27 PC28 PC29 PC30
11816 0.0006641651 0.0004313872 -0.013792291 -0.0005797079 -0.0063880062
7781 0.0199563579 0.0012805460 -0.020003535 -0.0011771516 -0.0119190636
11789 0.0059225351 -0.0116895082 0.006888456 0.0031739555 -0.0174609586
7085 0.0062528284 -0.0038049849 -0.009991975 0.0035270033 -0.0002146547
11815 0.0131521779 -0.0153269696 -0.047636104 -0.0115370941 0.0116499853
2466 0.0021896040 -0.0064252112 -0.005588210 0.0001451892 0.0080350454
11788 0.0132245835 -0.0176876063 -0.012057214 -0.0035393488 0.0031516041
11819 0.0066731352 0.0050563178 0.009187042 -0.0086114160 -0.0093221239
2524 0.0031475705 0.0030567255 -0.014691633 -0.0047948017 -0.0126643266
7079 0.0045005947 0.0009646853 0.002858522 -0.0111365272 0.0040613832
PC31 PC32 PC33 PC34 PC35
11816 -0.0037368764 -0.0038947220 -0.0005410008 0.0059624601 -0.009156611
7781 0.0006888449 -0.0122833518 -0.0019858976 -0.0000547042 0.003677459
11789 0.0011097748 -0.0063315053 0.0290117088 0.0199553033 -0.018883446
7085 0.0021926905 0.0095411906 -0.0119655270 -0.0192237829 0.011196027
11815 -0.0091963642 -0.0008259441 -0.0300023264 -0.0116068237 0.005579303
2466 -0.0064441977 0.0002998723 0.0068706736 0.0025454298 0.010589951
11788 -0.0040489683 0.0065913713 0.0032923302 -0.0099385183 -0.007798115
11819 0.0048392104 -0.0036279651 -0.0082335279 0.0013196240 0.012304036
2524 0.0018746328 0.0053570126 0.0067228900 0.0018061307 -0.005322999
7079 0.0046453444 0.0028927851 -0.0044873435 -0.0004754241 0.006900985
PC36 PC37 PC38 PC39 PC40
11816 0.001279035 -0.000065646 -0.011179197 -0.007601291 0.0009312665
7781 -0.008403350 -0.005407805 -0.002504939 -0.011376960 0.0053615286
11789 0.016582177 0.028085407 -0.010924770 0.047525777 0.0058763693
7085 0.002754253 0.006552920 0.006539880 0.000275554 -0.0008609969
11815 0.010040114 0.002554673 0.009179884 0.005696559 -0.0000740802
2466 -0.002976382 -0.002510414 0.009939170 -0.001172958 -0.0137615289
11788 -0.011386930 -0.007002010 0.006000413 0.022673205 -0.0058617809
11819 -0.020052758 0.003326294 -0.004116040 -0.003714350 0.0010275596
2524 -0.002389921 -0.003763697 -0.005068825 0.001848384 0.0018644888
7079 -0.009540469 -0.011291535 0.008570800 -0.009198282 0.0074012137
PC41 PC42 PC43 PC44 PC45
11816 -0.0006862523 0.009092179 0.0059986882 0.0037055471 0.014743647
7781 -0.0124363157 -0.002912565 0.0026876140 -0.0032409443 -0.001301784
11789 0.0035825968 -0.002250872 -0.0047695365 0.0045747936 0.009233318
7085 0.0015195829 0.001032578 -0.0227066163 -0.0001848902 -0.007814055
11815 0.0134587338 -0.004131190 0.0249484168 -0.0014560206 0.008834945
2466 0.0023845566 0.011523453 0.0057027711 -0.0036221609 -0.009706634
11788 0.0025383368 -0.009421934 0.0006058253 -0.0125038602 -0.024302354
11819 0.0262596420 -0.012041330 -0.0139347153 0.0050141244 0.016430338
2524 -0.0017335617 0.002186851 -0.0015064047 -0.0009525206 0.001882066
7079 -0.0080630279 0.005484615 0.0054595949 -0.0195240132 0.005752621
PC46 PC47 PC48
11816 -0.0010271266 -0.0033302213 -0.0060834927
7781 0.0008990385 0.0022544252 0.0009047707
11789 0.0003719057 -0.0099266015 0.0022713108
7085 0.0344911072 -0.0065385694 0.0008679558
11815 0.0043601613 -0.0601705383 0.0044831129
2466 -0.0068804934 0.0002611288 -0.0015808718
11788 0.0099734871 0.0098594497 -0.0124435957
11819 0.0202119051 0.0332982949 -0.0090143018
2524 0.0072784363 0.0037534144 0.0101222646
7079 0.0039413216 -0.0120223706 -0.0055901140
results_mouthparts$biplot # El BiplotWarning: ggrepel: 13104 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Deseq2 leg data
What to do after running Kallisto
After running Kallisto on all the RNA-seq samples, you should have one abundance.tsv file per sample. These files contain transcript-level quantifications with the following columns:
target_id: transcript name
length: transcript length
eff_length: effective length
est_counts: estimated number of reads
tpm: transcripts per million (normalized expression)
Step 1: Prepare a Sample Metadata File
Create a metadata file (e.g., sample_metadata.tsv) with information about each sample and the corresponding abundance.tsv file:
Step 2: Import Quantifications Using tximport in R
Kallisto quantifies transcripts, but many downstream tools (like DESeq2) work with gene-level data. To import and process your results in R:
library(tximport)
library(readr)sample_table <- read_tsv("sample_metadata_legs.tsv")Rows: 96 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): sample_id, condition, path
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
files <- file.path(sample_table$path, "abundance.tsv")
names(files) <- sample_table$sample_idStep 3: get the tx2gene.tsv
grep ">" Spider_Transcriptome.fasta.transdecoder.cds | \
sed 's/>//' | \
awk '{split($2,a,"[.~]"); print $1"\t"a[2]"."a[3]"."a[4]}' > tx2gene.tsvgrep ">" CD_HIT-EST_Spider_Transcriptome.fasta.transdecoder_header_reeplaced.fasta \
| sed 's/>//' \
| awk -F '.' '{
gene_id = $1 "." $2 "." $3;
print $0 "\t" gene_id
}' > tx2gene.tsvtx2gene <- read_tsv("tx2gene.tsv", col_names = c("TXNAME", "GENEID"))Rows: 59757 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): TXNAME, GENEID
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tx2gene)# A tibble: 6 × 2
TXNAME GENEID
<chr> <chr>
1 STRG.100.1.p2 STRG.100.1
2 STRG.100.1.p1 STRG.100.1
3 STRG.100.1.p3 STRG.100.1
4 STRG.100.1.p4 STRG.100.1
5 STRG.10002.1.p1 STRG.10002.1
6 STRG.10006.1.p1 STRG.10006.1
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene)Note: importing `abundance.h5` is typically faster than `abundance.tsv`
reading in files with read_tsv
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
transcripts missing from tx2gene: 1
summarizing abundance
summarizing counts
summarizing length
str(txi)List of 4
$ abundance : num [1:48660, 1:96] 4.79 8.04 16.59 15.77 0 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:48660] "Replacing_STRG.11767.1" "STRG.100.1" "STRG.10002.1" "STRG.10006.1" ...
.. ..$ : chr [1:96] "S412_S1_L001" "S412_S1_L002" "S412_S1_L003" "S412_S1_L004" ...
$ counts : num [1:48660, 1:96] 36 17.6 5 68 0 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:48660] "Replacing_STRG.11767.1" "STRG.100.1" "STRG.10002.1" "STRG.10006.1" ...
.. ..$ : chr [1:96] "S412_S1_L001" "S412_S1_L002" "S412_S1_L003" "S412_S1_L004" ...
$ length : num [1:48660, 1:96] 1944 567 78 1116 350 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:48660] "Replacing_STRG.11767.1" "STRG.100.1" "STRG.10002.1" "STRG.10006.1" ...
.. ..$ : chr [1:96] "S412_S1_L001" "S412_S1_L002" "S412_S1_L003" "S412_S1_L004" ...
$ countsFromAbundance: chr "no"
head(txi$counts) S412_S1_L001 S412_S1_L002 S412_S1_L003 S412_S1_L004
Replacing_STRG.11767.1 36.00000 36.0000 37.00000 44.0000
STRG.100.1 17.58651 15.0042 8.05311 12.8948
STRG.10002.1 5.00000 3.0000 3.00000 5.0000
STRG.10006.1 68.00000 51.0000 74.00000 73.0000
STRG.10007.1 0.00000 0.0000 0.00000 0.0000
STRG.10008.1 53.00000 46.0000 47.00000 52.0000
S413_S2_L001 S413_S2_L002 S413_S2_L003 S413_S2_L004
Replacing_STRG.11767.1 19.00000 28.0000 23.00000 30.00000
STRG.100.1 21.09364 13.9723 15.36790 21.61947
STRG.10002.1 11.00000 6.0000 7.91369 8.00000
STRG.10006.1 42.00000 32.0000 49.00000 45.00000
STRG.10007.1 0.00000 0.0000 1.00000 0.00000
STRG.10008.1 73.00000 66.0000 68.00000 55.00000
S414_S3_L001 S414_S3_L002 S414_S3_L003 S414_S3_L004
Replacing_STRG.11767.1 60.00000 43.00000 58.00000 56.00000
STRG.100.1 25.23527 44.36466 29.96445 40.45103
STRG.10002.1 5.65434 4.00000 3.00000 1.00000
STRG.10006.1 69.00000 67.00000 77.00000 81.00000
STRG.10007.1 0.00000 0.00000 1.00000 0.00000
STRG.10008.1 50.00000 53.00000 67.00000 56.00000
S415_S4_L001 S415_S4_L002 S415_S4_L003 S415_S4_L004
Replacing_STRG.11767.1 37.00000 40.00000 42.00000 45.0000
STRG.100.1 25.93116 16.21657 17.67708 18.1504
STRG.10002.1 8.96100 6.00000 3.91046 3.0000
STRG.10006.1 53.00000 61.00000 55.00000 45.0000
STRG.10007.1 0.00000 1.00000 0.00000 0.0000
STRG.10008.1 73.00000 56.00000 73.00000 64.0000
S416_S5_L001 S416_S5_L002 S416_S5_L003 S416_S5_L004
Replacing_STRG.11767.1 76.00000 61.00000 55.000000 51.00000
STRG.100.1 14.96924 15.46687 8.900654 18.94434
STRG.10002.1 6.00000 7.00000 7.000000 10.00000
STRG.10006.1 83.00000 64.00000 48.000000 50.00000
STRG.10007.1 0.00000 0.00000 0.000000 0.00000
STRG.10008.1 38.00000 49.00000 57.000000 52.00000
S417_S6_L001 S417_S6_L002 S417_S6_L003 S417_S6_L004
Replacing_STRG.11767.1 34.00000 36.0000 30.000000 34.00000
STRG.100.1 11.07388 5.1511 2.898219 15.60513
STRG.10002.1 5.79235 4.0000 7.000000 9.91312
STRG.10006.1 32.00000 33.0000 32.000000 22.00000
STRG.10007.1 0.00000 0.0000 0.000000 0.00000
STRG.10008.1 50.00000 48.0000 37.000000 45.00000
S418_S7_L001 S418_S7_L002 S418_S7_L003 S418_S7_L004
Replacing_STRG.11767.1 39.0000 45.00000 44.0000 48.00000
STRG.100.1 24.3587 15.66035 23.0481 18.47636
STRG.10002.1 11.0000 8.00000 6.0000 9.89701
STRG.10006.1 69.0000 91.00000 99.0000 77.00000
STRG.10007.1 0.0000 0.00000 0.0000 0.00000
STRG.10008.1 70.0000 79.00000 73.0000 73.00000
S419_S8_L001 S419_S8_L002 S419_S8_L003 S419_S8_L004
Replacing_STRG.11767.1 27.00000 35.00000 26.00000 29.00000
STRG.100.1 8.97996 15.07399 8.68057 11.61367
STRG.10002.1 3.00000 4.00000 5.54872 4.00000
STRG.10006.1 70.00000 80.00000 72.00000 66.00000
STRG.10007.1 1.00000 0.00000 0.00000 0.00000
STRG.10008.1 60.00000 79.00000 82.00000 75.00000
S420_S9_L001 S420_S9_L002 S420_S9_L003 S420_S9_L004
Replacing_STRG.11767.1 30.00000 34.0000 35.00000 34.00000
STRG.100.1 14.80549 17.2294 13.11472 13.71847
STRG.10002.1 3.00000 6.0000 6.00000 5.00000
STRG.10006.1 44.00000 42.0000 48.00000 45.00000
STRG.10007.1 0.00000 0.0000 0.00000 1.00000
STRG.10008.1 42.00000 34.0000 31.00000 28.00000
S421_S10_L001 S421_S10_L002 S421_S10_L003 S421_S10_L004
Replacing_STRG.11767.1 19.00000 18.00000 8.00000 19.00000
STRG.100.1 13.40461 8.95026 14.38660 13.09142
STRG.10002.1 7.87257 2.76595 8.91261 6.00000
STRG.10006.1 37.00000 29.00000 39.00000 38.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 41.00000 40.00000 52.00000 49.00000
S422_S11_L001 S422_S11_L002 S422_S11_L003 S422_S11_L004
Replacing_STRG.11767.1 67.00000 66.00000 74.00000 57.00000
STRG.100.1 19.49340 8.49544 31.61923 23.16305
STRG.10002.1 4.82862 9.96182 7.00000 5.00000
STRG.10006.1 118.00000 119.00000 113.00000 100.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 72.00000 57.00000 57.00000 58.00000
S423_S12_L001 S423_S12_L002 S423_S12_L003 S423_S12_L004
Replacing_STRG.11767.1 33.00000 61.00000 30.00000 37.00000
STRG.100.1 15.64289 15.05804 16.67203 7.77217
STRG.10002.1 7.00000 3.00000 2.00000 2.00000
STRG.10006.1 91.00000 86.00000 96.00000 97.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 53.00000 70.00000 75.00000 74.00000
S496_S1_L001 S496_S1_L002 S496_S1_L003 S496_S1_L004
Replacing_STRG.11767.1 72.0000 92.00000 97.00000 89.00000
STRG.100.1 11.0024 11.71428 7.75817 10.96523
STRG.10002.1 6.0000 4.80700 3.82980 3.69843
STRG.10006.1 95.0000 58.00000 69.00000 91.00000
STRG.10007.1 0.0000 0.00000 0.00000 0.00000
STRG.10008.1 78.0000 73.00000 60.00000 74.00000
S497_S2_L001 S497_S2_L002 S497_S2_L003 S497_S2_L004
Replacing_STRG.11767.1 72.00000 82.00000 65.00000 85.00000
STRG.100.1 13.01435 8.18378 4.94086 11.15118
STRG.10002.1 5.19714 3.00000 6.79145 4.32871
STRG.10006.1 23.00000 23.00000 27.00000 28.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 85.00000 74.00000 78.00000 73.00000
S498_S3_L001 S498_S3_L002 S498_S3_L003 S498_S3_L004
Replacing_STRG.11767.1 148.000000 146.00000 156.00000 180.00000
STRG.100.1 1.894956 8.54695 12.14463 6.53818
STRG.10002.1 7.485200 4.28585 4.11113 4.86475
STRG.10006.1 103.000000 94.00000 91.00000 128.00000
STRG.10007.1 0.000000 1.00000 0.00000 0.00000
STRG.10008.1 65.000000 84.00000 83.00000 90.00000
S499_S4_L001 S499_S4_L002 S499_S4_L003 S499_S4_L004
Replacing_STRG.11767.1 115.00000 116.00000 127.00000 127.00000
STRG.100.1 5.67383 11.78732 11.86403 9.62132
STRG.10002.1 4.14841 1.68059 4.00000 1.00000
STRG.10006.1 88.00000 94.00000 91.00000 71.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 81.00000 91.00000 100.00000 93.00000
S500_S5_L001 S500_S5_L002 S500_S5_L003 S500_S5_L004
Replacing_STRG.11767.1 43.000000 60.0000 58.00000 59.00000
STRG.100.1 8.673382 9.7602 5.56031 12.27972
STRG.10002.1 8.259480 3.0000 2.24002 7.68625
STRG.10006.1 27.000000 49.0000 33.00000 41.00000
STRG.10007.1 0.000000 0.0000 2.00000 0.00000
STRG.10008.1 57.000000 52.0000 60.00000 51.00000
S501_S6_L001 S501_S6_L002 S501_S6_L003 S501_S6_L004
Replacing_STRG.11767.1 71.00000 64.00000 59.00000 64.00000
STRG.100.1 11.29925 13.45682 10.28725 8.68875
STRG.10002.1 2.17064 5.31865 3.70688 3.55327
STRG.10006.1 20.00000 17.00000 16.00000 23.00000
STRG.10007.1 0.00000 0.00000 0.00000 1.00000
STRG.10008.1 66.00000 60.00000 72.00000 89.00000
S502_S7_L001 S502_S7_L002 S502_S7_L003 S502_S7_L004
Replacing_STRG.11767.1 57.00000 53.00000 43.00000 64.00000
STRG.100.1 4.90187 11.39974 12.78450 7.97952
STRG.10002.1 4.68219 4.95005 2.85195 2.91939
STRG.10006.1 21.00000 21.00000 24.00000 20.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 40.00000 44.00000 53.00000 51.00000
S503_S8_L001 S503_S8_L002 S503_S8_L003 S503_S8_L004
Replacing_STRG.11767.1 56.00000 70.00000 65.0000 58.0000
STRG.100.1 17.10180 14.59605 9.3638 10.2931
STRG.10002.1 5.16936 4.56377 1.4912 10.3117
STRG.10006.1 11.00000 20.00000 26.0000 30.0000
STRG.10007.1 0.00000 0.00000 0.0000 0.0000
STRG.10008.1 60.00000 47.00000 61.0000 81.0000
S504_S9_L001 S504_S9_L002 S504_S9_L003 S504_S9_L004
Replacing_STRG.11767.1 29.00000 38.0000 41.00000 60.00000
STRG.100.1 16.70080 13.9396 5.37119 16.83315
STRG.10002.1 6.89376 0.0000 1.77918 5.00000
STRG.10006.1 51.00000 34.0000 24.00000 50.00000
STRG.10007.1 0.00000 0.0000 0.00000 0.00000
STRG.10008.1 57.00000 56.0000 56.00000 52.00000
S505_S10_L001 S505_S10_L002 S505_S10_L003 S505_S10_L004
Replacing_STRG.11767.1 50.00000 52.00000 34.00000 44.00000
STRG.100.1 2.82538 8.08838 18.55874 7.41804
STRG.10002.1 2.62799 5.62574 6.80990 3.76525
STRG.10006.1 12.00000 25.00000 16.00000 16.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 72.00000 67.00000 77.00000 86.00000
S506_S11_L001 S506_S11_L002 S506_S11_L003 S506_S11_L004
Replacing_STRG.11767.1 34.00000 47.00000 28.00000 41.00000
STRG.100.1 7.64892 0.00000 1.70934 6.95797
STRG.10002.1 5.00000 3.45835 3.77008 4.42912
STRG.10006.1 37.00000 36.00000 40.00000 40.00000
STRG.10007.1 0.00000 0.00000 0.00000 0.00000
STRG.10008.1 53.00000 53.00000 45.00000 38.00000
S507_S12_L001 S507_S12_L002 S507_S12_L003 S507_S12_L004
Replacing_STRG.11767.1 36.00000 43.00000 46.00000 46.000000
STRG.100.1 0.00000 6.83127 11.46230 3.088915
STRG.10002.1 7.75685 2.00000 3.41329 7.529120
STRG.10006.1 32.00000 41.00000 42.00000 44.000000
STRG.10007.1 0.00000 0.00000 0.00000 0.000000
STRG.10008.1 53.00000 60.00000 55.00000 82.000000
Run Deseq2
# read in sample info
info3 <-read.csv("sample.csv", sep = ",", header = T, row.names = 1)
head(info3) condition replicate
S412_S1_L001 Male_distal Male_distal_1
S412_S1_L002 Male_distal Male_distal_1
S412_S1_L003 Male_distal Male_distal_1
S412_S1_L004 Male_distal Male_distal_1
S413_S2_L001 Male_proximal Male_proximal_1
S413_S2_L002 Male_proximal Male_proximal_1
info3$condition <- factor(info3$condition)# Check that the number of columns are the same to the number of row
all(colnames(txi) %in% rownames(info3))[1] TRUE
# are they in the same order?
all(colnames(txi) == rownames(info3))[1] TRUE
# Crear el DESeqDataSet desde tximport
dds_kallisto <- DESeqDataSetFromTximport(txi,
colData = info3,
design = ~ condition)using counts and average transcript lengths from tximport
dds_kallisto_collapsed <- collapseReplicates(dds_kallisto,
groupby = info3$replicate,
run = rownames(info3),
renameCols = TRUE)Warning in collapseReplicates(dds_kallisto, groupby = info3$replicate, run = rownames(info3), :
Warning! collapseReplicates only sums columns of the 'counts' assay.
Other assays are dropped from output, and must be manually combined,
as it is not unambiguous how to combine non-count assays.
When and how to filter low counts?
Before running differential expression analysis, it’s good practice to filter out genes with very low expression across all samples. These genes typically represent transcriptional noise and add little to no value in downstream analysis. Removing them reduces computational time and improves visualizations.
In this project, I used TPM (Transcripts Per Million) values to evaluate gene expression across samples. Specifically, we calculated the minimum TPM per gene across all samples and examined its distribution in log10 scale.
The following plot shows the distribution of minimum TPM per gene, with vertical lines indicating potential filtering thresholds (TPM ≥ 1, 2, and 4). Genes to the right of the threshold line are retained.
TPM distribution
# 1. Extract TPM matrix from tximport object
tpm_matrix <- txi$abundance
# 2. Calculate the minimum TPM per gene across all samples
min_tpm_per_gene <- apply(tpm_matrix, 1, min)
# 3. Log10 transform to visualize
log10_min_tpm <- log10(min_tpm_per_gene + 1e-6) # avoid log10(0)
# 4. Create dataframe for plotting
df_tpm <- data.frame(
TPM_min = log10_min_tpm,
group = "Genes"
)
# 5. Count number of genes per threshold
n_tpm1 <- sum(min_tpm_per_gene >= 1)
n_tpm2 <- sum(min_tpm_per_gene >= 2)
n_tpm4 <- sum(min_tpm_per_gene >= 4)
total_genes <- length(min_tpm_per_gene)
# 6. Load ggplot2
library(ggplot2)
# 7. Create violin plot
ggplot(df_tpm, aes(x = TPM_min, y = group)) +
geom_violin(fill = "mediumpurple", alpha = 0.4, trim = FALSE) +
geom_jitter(height = 0.15, alpha = 0.2, size = 0.5) +
# Vertical lines for cutoffs
geom_vline(xintercept = log10(1), linetype = "dashed", color = "red") +
geom_vline(xintercept = log10(2), linetype = "dashed", color = "blue") +
geom_vline(xintercept = log10(4), linetype = "dashed", color = "grey") +
# Add gene counts as annotations
annotate("text", x = log10(1), y = 1.2, label = paste0("≥1 TPM: ", n_tpm1), color = "red", angle = 90, vjust = -0.5, size = 4) +
annotate("text", x = log10(2), y = 1.2, label = paste0("≥2 TPM: ", n_tpm2), color = "blue", angle = 90, vjust = -0.5, size = 4) +
annotate("text", x = log10(4), y = 1.2, label = paste0("≥4 TPM: ", n_tpm4), color = "darkgray", angle = 90, vjust = -0.5, size = 4) +
# Axis scale in log10
scale_x_continuous(
name = "Minimum TPM per gene (log10 scale)",
breaks = log10(c(0.1, 1, 2, 4, 10, 100)),
labels = c("0.1", "1", "2", "4", "10", "100")
) +
labs(
title = "Distribution of Minimum TPM per Gene",
y = ""
) +
theme_minimal(base_size = 14)From the plot, we observe that:
- Many genes have very low minimum TPMs (left tail of the distribution).
- A threshold of TPM ≥ 1 retains 9526 genes (≈ 19.6%).
- A threshold of TPM ≥ 2 retains 7540 genes.
- A threshold of TPM ≥ 4 (as used in reference study) retains 5214 genes.
👉 In this study, we chose a threshold of TPM ≥ 2 as a balance between sensitivity and stringency.
Filtering genes with low expression
# Here we are filtering genes with low expression
keep <- rowSums(counts(dds_kallisto_collapsed)) >= 2
dds_kallisto_collapsed <- dds_kallisto_collapsed[keep, ]Set reference level
By default, R assigns factor levels alphabetically, which means DESeq2 will use the first level (alphabetically) as the reference unless you specify otherwise. If you don’t define a reference level, all comparisons will follow this alphabetical order. There are two ways to control which group is used as the reference: 1. Use the contrast argument in the results() function to specify the exact comparison (you’ll see how to do this later). 2. Manually set the reference level of your factor variable before running DESeq() or nbinomWaldTest()/nbinomLRT(), so the change is applied properly.
To change factor levels, you can use the factor() function.
dds$condition <- factor(dds$condition, levels = c("untreated","treated"))In my case my reference level
# reference level
dds_kallisto_collapsed$condition <- relevel(dds_kallisto_collapsed$condition, ref = "Male_distal")Run main Deseq2
# run main DESeq2
ddsDE <- DESeq(dds_kallisto_collapsed)estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
#sumary of the results
res_default <- results(ddsDE)
summary(res_default)
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 3300, 6.9%
LFC < 0 (down) : 3857, 8%
outliers [1] : 155, 0.32%
low counts [2] : 9301, 19%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
resultsNames(ddsDE)[1] "Intercept"
[2] "condition_Female_distal_vs_Male_distal"
[3] "condition_Female_proximal_vs_Male_distal"
[4] "condition_Male_proximal_vs_Male_distal"
Ordering DESeq2 results and exporting to CSV
• res_default[order(res_default$padj), ]:
This line orders the result table by adjusted p-values (padj) in ascending order. Genes with the smallest (most significant) p-adjusted values will appear at the top. • resOrdered_default <- …: The ordered result is assigned to a new object called resOrdered_default. • write.csv(…): This line exports the ordered results into a CSV file named “resOrdered_padj_default_collapsed.csv”, which can be opened in Excel or used for further analysis (e.g., volcano plots, enrichment analysis).
Why is this important? • Sorting by adjusted p-value helps highlight the most significant differentially expressed genes. • Exporting the file allows for easy sharing, inspection, and integration with other tools or analyses. • The suffix collapsed in the filename reminds us that technical replicates were collapsed prior to DESeq2 analysis.
resOrdered_default <- res_default[order(res_default$padj), ]
write.csv(resOrdered_default, "resOrdered_padj_default_collapsed.csv")resOrdered_default <- res_default[order(res_default$padj), ]
write.csv(resOrdered_default, "resOrdered_padj_default_collapsed.csv")1. Male_distal vs proximal
res_male_distal_vs_male_proximal <- results(ddsDE, contrast = c("condition", "Male_distal", "Male_proximal"))
summary(res_male_distal_vs_male_proximal)
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 3853, 8%
LFC < 0 (down) : 3300, 6.9%
outliers [1] : 155, 0.32%
low counts [2] : 9301, 19%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
2.Female proximal vs Male proximal
res_female_proximal_vs_male_proximal <- results(ddsDE, contrast = c("condition", "Female_proximal", "Male_proximal"))
summary(res_female_proximal_vs_male_proximal )
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 4853, 10%
LFC < 0 (down) : 6539, 14%
outliers [1] : 155, 0.32%
low counts [2] : 9301, 19%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
3. Female distal vs Male proximal
res_female_distal_vs_male_proximal <- results(ddsDE, contrast = c("condition", "Female_distal", "Male_proximal"))
summary(res_female_distal_vs_male_proximal)
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 5604, 12%
LFC < 0 (down) : 5922, 12%
outliers [1] : 155, 0.32%
low counts [2] : 10229, 21%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
4. Female distal vs Female proximal
res_female_distal_vs_female_proximal <- results(ddsDE, contrast = c("condition", "Female_distal", "Female_proximal"))
summary(res_female_distal_vs_female_proximal)
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 1988, 4.1%
LFC < 0 (down) : 1536, 3.2%
outliers [1] : 155, 0.32%
low counts [2] : 12088, 25%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Exporting results in csv files
# Define a helper function to save DESeq2 contrast results
save_deseq2_results <- function(dds_object, contrast_vector, filename_prefix) {
# Run DESeq2 contrast
res <- results(dds_object, contrast = contrast_vector)
# Order by adjusted p-value
res_ordered <- res[order(res$padj), ]
# Export to CSV
filename <- paste0("resOrdered_padj_", filename_prefix, ".csv")
write.csv(res_ordered, file = filename)
# Return summary (optional)
return(summary(res))
}# 1. Male distal vs male proximal
save_deseq2_results(ddsDE, c("condition", "Male_distal", "Male_proximal"), "MaleDistal_vs_MaleProximal")
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 3853, 8%
LFC < 0 (down) : 3300, 6.9%
outliers [1] : 155, 0.32%
low counts [2] : 9301, 19%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# 2. Female proximal vs male proximal
save_deseq2_results(ddsDE, c("condition", "Female_proximal", "Male_proximal"), "FemaleProximal_vs_MaleProximal")
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 4853, 10%
LFC < 0 (down) : 6539, 14%
outliers [1] : 155, 0.32%
low counts [2] : 9301, 19%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# 3. Female distal vs male proximal
save_deseq2_results(ddsDE, c("condition", "Female_distal", "Male_proximal"), "FemaleDistal_vs_MaleProximal")
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 5604, 12%
LFC < 0 (down) : 5922, 12%
outliers [1] : 155, 0.32%
low counts [2] : 10229, 21%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# 4. Female distal vs female proximal
save_deseq2_results(ddsDE, c("condition", "Female_distal", "Female_proximal"), "FemaleDistal_vs_FemaleProximal")
out of 47982 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 1988, 4.1%
LFC < 0 (down) : 1536, 3.2%
outliers [1] : 155, 0.32%
low counts [2] : 12088, 25%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Volcano plots of differential gene expression
These volcano plots show the distribution of differentially expressed genes (DEGs) across four biological comparisons:
- Male distal vs Male proximal
- Female proximal vs Male proximal
- Female distal vs Male proximal
- Female distal vs Female proximal
library(RColorBrewer)
plot_volcano_paired <- function(res, title) {
df <- as.data.frame(res)
df$gene <- rownames(df)
# Categorización
df$sig <- "Not significant"
df$sig[df$padj < 0.05 & df$log2FoldChange > 1] <- "Upregulated"
df$sig[df$padj < 0.05 & df$log2FoldChange < -1] <- "Downregulated"
df$sig <- factor(df$sig, levels = c("Upregulated", "Downregulated", "Not significant"))
# Paleta Paired
paired_colors <- brewer.pal(3, "Paired")
ggplot(df, aes(x = log2FoldChange, y = -log10(padj), color = sig)) +
geom_point(alpha = 0.7, size = 1.2) +
scale_color_manual(values = c("Upregulated" = paired_colors[2],
"Downregulated" = paired_colors[4],
"Not significant" = "gray80")) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
labs(title = title,
x = expression(Log[2]~Fold~Change),
y = expression(-Log[10]~adjusted~p~value),
color = "Regulation") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
axis.title = element_text(face = "bold"),
legend.position = "bottom"
)
}p1 <- plot_volcano_paired(res_male_distal_vs_male_proximal, "Male distal vs Male proximal")
p2 <- plot_volcano_paired(res_female_proximal_vs_male_proximal, "Female proximal vs Male proximal")
p3 <- plot_volcano_paired(res_female_distal_vs_male_proximal, "Female distal vs Male proximal")
p4 <- plot_volcano_paired(res_female_distal_vs_female_proximal, "Female distal vs Female proximal")library(patchwork)Warning: package 'patchwork' was built under R version 4.3.3
panel <- (p1 | p2) / (p3 | p4)
panelWarning: Removed 9456 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 9456 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 10384 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12243 rows containing missing values or values outside the scale range
(`geom_point()`).
ggsave("volcano_panel_paired_legs.tiff", panel, width = 12, height = 10, dpi = 300)Warning: Removed 9456 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 9456 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 10384 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12243 rows containing missing values or values outside the scale range
(`geom_point()`).
ggsave("volcano_panel_paired_legs.pdf", panel, width = 12, height = 10)Warning: Removed 9456 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 9456 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 10384 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12243 rows containing missing values or values outside the scale range
(`geom_point()`).
Each point represents a gene. The x-axis shows the log₂ fold change (log₂FC) of expression, while the y-axis shows the –log₁₀ of the adjusted p-value (padj).
- Blue points represent significantly upregulated genes.
- Dark gray points represent significantly downregulated genes.
- Light gray points are not significant (padj > 0.05 or |log₂FC| < 1).
Dashed lines represent the thresholds used for significance:
|log₂FC| > 1 and padj < 0.05.
Remeber the factor levels you set above:
dds$condition <- factor(dds$condition, levels = c("untreated","treated"))In my case my reference level is the male proximal segment
1. Male distal vs Male proximal
→ Clear upregulation of a subset of genes in the distal segment. Likely corresponds to chemosensory genes, as this region contains wall-pore sensilla associated with olfaction. 2. Female proximal vs Male proximal → Moderate sex-biased expression in the proximal segment. Suggests subtle transcriptional differences even in regions lacking major sensory structures. 3. Female distal vs Male proximal → Stronger transcriptomic divergence, likely reflecting both segmental differences and sexual dimorphism. Many upregulated genes may relate to tip-pore sensilla and reproductive or behavioral adaptations. 4. Female distal vs Female proximal → Similar patterns to males, supporting the idea of segment-specific expression across sexes. Consistent upregulation in distal segments may point to conserved functional specializations.
Heatmap
# Lets use vst (variance stabilizing transformation)
vsd <- vst(dds_kallisto_collapsed, blind = TRUE)# Obtain the transformed matrix
vsd_mat <- assay(vsd)# Upload transcript-to-gene mapping file
tx2gene# A tibble: 59,757 × 2
TXNAME GENEID
<chr> <chr>
1 STRG.100.1.p2 STRG.100.1
2 STRG.100.1.p1 STRG.100.1
3 STRG.100.1.p3 STRG.100.1
4 STRG.100.1.p4 STRG.100.1
5 STRG.10002.1.p1 STRG.10002.1
6 STRG.10006.1.p1 STRG.10006.1
7 STRG.10007.1.p1 STRG.10007.1
8 STRG.10008.1.p1 STRG.10008.1
9 STRG.10010.3.p2 STRG.10010.3
10 STRG.10010.4.p1 STRG.10010.4
# ℹ 59,747 more rows
colnames(tx2gene) <- c("TXNAME", "GENEID")# Here include the genes you can to plot in the heatmap
specific_transcripts <- c("STRG.36829.3.p2", "STRG.63751.1.p1", "STRG.32492.1.p2", "STRG.29635.1.p1", "STRG.24254.2.p1", "STRG.24262.3.p1", "STRG.38773.1.p1", "STRG.49562.1.p1", "STRG.33523.1.p1", "STRG.33525.2.p1", "STRG.33525.1.p1", "STRG.50993.1.p1", "STRG.26518.1.p1", "STRG.50071.1.p1", "STRG.26560.2.p3", "STRG.14809.2.p1", "STRG.26520.1.p4", "STRG.26541.3.p2", "STRG.33350.1.p1", "STRG.45426.1.p1", "STRG.12677.1.p2", "STRG.62762.1.p1", "STRG.21210.1.p1", "STRG.2167.1.p1", "STRG.63201.1.p1", "STRG.43176.1.p1", "STRG.15456.1.p1", "STRG.37954.1.p1", "STRG.39111.1.p2", "STRG.33530.1.p1", "STRG.33519.1.p3", "STRG.26560.1.p3", "STRG.46044.1.p1", "STRG.49563.1.p1", "STRG.56704.4.p1", "STRG.56704.1.p1", "STRG.52863.1.p1", "STRG.46275.1.p2", "STRG.46044.3.p1", "STRG.46044.2.p1", "STRG.50985.1.p1", "STRG.46031.3.p1", "STRG.56704.3.p1", "STRG.56704.2.p1", "STRG.7561.1.p2", "STRG.56704.5.p1", "STRG.2001.1.p1", "STRG.19212.3.p1", "STRG.18882.2.p1", "STRG.39429.2.p1", "STRG.33350.2.p1", "STRG.19212.2.p1", "STRG.26540.1.p1", "STRG.65166.1.p1", "STRG.33530.2.p2", "STRG.38773.1.p2", "STRG.33519.2.p1", "STRG.29634.1.p1", "STRG.11765.1.p1", "STRG.12677.1.p1", "STRG.20575.1.p7", "STRG.63712.1.p1", "STRG.38406.1.p1", "STRG.35168.1.p1", "STRG.26541.2.p1", "STRG.13473.1.p1", "STRG.5450.2.p1", "STRG.1974.1.p1", "STRG.50992.1.p1", "STRG.35168.2.p1", "STRG.29634.2.p1", "STRG.11600.1.p2", "STRG.11765.2.p1", "STRG.11767.1.p1", "STRG.11770.2.p1", "STRG.12825.1.p2", "STRG.13469.1.p1", "STRG.1588.1.p1", "STRG.18882.1.p1", "STRG.20574.1.p2", "STRG.21136.2.p1", "STRG.2159.1.p1", "STRG.27354.1.p1", "STRG.29632.1.p1", "STRG.32906.1.p1", "STRG.32908.1.p3", "STRG.33520.1.p1", "STRG.38401.1.p1", "STRG.38402.1.p1", "STRG.38405.1.p1", "STRG.4504.2.p1", "STRG.45427.1.p1", "STRG.45427.2.p1", "STRG.50989.1.p2", "STRG.56820.4.p1", "STRG.61662.1.p1", "STRG.63752.1.p1", "STRG.63755.1.p1", "STRG.7818.1.p1", "STRG.26555.1.p1", "STRG.46031.1.p1", "STRG.52863.2.p1", "STRG.52918.5.p1", "STRG.52927.1.p1", "STRG.52929.1.p1", "STRG.5450.2.p2", "STRG.5450.2.p3", "STRG.56703.1.p1", "STRG.65166.2.p1", "STRG.7545.1.p1", "STRG.7819.1.p1",
"TRINITY_DN1025_c0_g1_i2_S412_S2.p1_replace_STRG.7561.1.p2","TRINITY_DN1042_c0_g1_i12_S498_S3.p1_replace_STRG.26520.1.p4","TRINITY_DN116590_c0_g1_i1_S419_S8.p1_replace_STRG.26541.3.p2", "TRINITY_DN12670_c0_g1_i3_S414_S3.p1_replace_STRG.50993.1.p1", "TRINITY_DN21302_c0_g1_i1_S498_S3.p1_replace_STRG.4504.2.p1",
"TRINITY_DN5107_c0_g1_i3_S417_S6.p1_replace STRG.26540.1.p1"
) # Filter and map to gene_id
mapped_gene_ids <- unique(tx2gene$GENEID[tx2gene$TXNAME %in% specific_transcripts])sum(mapped_gene_ids %in% rownames(vsd_mat)) # Esto debe ser > 0[1] 104
vsd_mat_subset <- vsd_mat[rownames(vsd_mat) %in% mapped_gene_ids, ]library(RColorBrewer)
library(pheatmap)
pheatmap(vsd_mat_subset,
scale = "row",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "complete",
color = colorRampPalette(rev(brewer.pal(n = 9, name = "RdBu")))(100),
fontsize_row = 6,
show_rownames = FALSE,
main = "Expression heatmap of chemosensory genes")# We scale by row (gen) as in pheatmap
mat_scaled <- t(scale(t(vsd_mat_subset)))#setting colors to make it pretty, or not?
#col_fun <- colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))
#col_fun <- circlize::colorRamp2(c(-2, 0, 2), c("#5e4fa2", "white", "#d53e4f"))
#col_fun <- circlize::colorRamp2(c(-2, 0, 2), c(rev(brewer.pal(n=3, name="RdBu"))))
#col_fun <- circlize::colorRamp2(c(-2, 0, 2), rev(brewer.pal(n = 3, name = "RdBu")))
#col_fun <- circlize::colorRamp2(c(-2, 0, 2), brewer.pal(n = 3, name = "YlGnBu"))
col_fun <- circlize::colorRamp2(c(-2, 0, 2), brewer.pal(n = 3, name = "YlGnBu"))I found this website usefult to choose colors for the heatmap:
https://jokergoo.github.io/ComplexHeatmap-reference/book/a-single-heatmap.html https://www.modb.pro/db/451172 https://r-graph-gallery.com/38-rcolorbrewers-palettes.html
Heatmap de Genes de Interés (Método Avanzado)
# === 1. DATA PREPARATION ===
# You already have `vsd_mat_subset`.
# Now, instead of just using it, let's verify it first.
# The key is to check if the matrix has rows before passing it to clusGap.
# This is the step you should run:
mapped_gene_ids <- unique(tx2gene$GENEID[tx2gene$TXNAME %in% specific_transcripts])
vsd_mat_subset <- vsd_mat[rownames(vsd_mat) %in% mapped_gene_ids, ]
# === Diagnostics ===
if (nrow(vsd_mat_subset) == 0) {
stop("Error: The data matrix `vsd_mat_subset` is empty.
This means none of the genes in your `specific_transcripts` list
could be mapped to the rownames of your `vsd_mat` matrix.
Please check the gene ID matches.")
}# Fin del diagnóstico
# === End of diagnosis ===
# 1.1. Scaling the matrix to z-scores
z_score_mat <- t(scale(t(vsd_mat_subset)))
z_score_mat[is.na(z_score_mat)] <- 0# 1.2. Find the optimal number of clusters with Gap Statistic
library(factoextra)
library(cluster)
set.seed(123)
gap_stat <- clusGap(z_score_mat, FUN = hcut, K.max = min(10, nrow(z_score_mat) - 1), B = 50)
k_optimo <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"])
cat("The optimal number of clusters (k) according to the Gap Statistic method is:", k_optimo, "\n")The optimal number of clusters (k) according to the Gap Statistic method is: 3
# compute the Gap
library(factoextra)
library(cluster)
set.seed(123)
gap_stat <- clusGap(z_score_mat, FUN = hcut, K.max = min(10, nrow(z_score_mat) - 1), B = 50)
# print the table for Gap Statistic
print(gap_stat$Tab) logW E.logW gap SE.sim
[1,] 5.072107 5.506959 0.4348519 0.01032245
[2,] 5.004108 5.453400 0.4492926 0.01091243
[3,] 4.955620 5.412368 0.4567477 0.01163410
[4,] 4.911396 5.378265 0.4668688 0.01137762
[5,] 4.882021 5.347967 0.4659456 0.01132089
[6,] 4.853074 5.320768 0.4676936 0.01123237
[7,] 4.828459 5.295135 0.4666762 0.01121097
[8,] 4.799889 5.270846 0.4709568 0.01090250
[9,] 4.775171 5.247896 0.4727252 0.01070766
[10,] 4.752579 5.225780 0.4732014 0.01096244
num_clusters_optimo <- 4We chose num_clusters_optimo <- 4 for a combination of statistical evidence and biological interpretability.
Statistical Justification from the Gap Statistic:
The Gap Statistic is a method used to estimate the optimal number of clusters (k). It works by comparing the within-cluster dispersion of your data to the expected dispersion under a null hypothesis of no clustering.
While the gap values continued to increase slightly after k=4, the increase was very gradual, suggesting that the clustering quality does not significantly improve with more than four clusters. This makes k=4 a robust and statistically defensible choice.
The k=4 heatmap provided a more nuanced and detailed view. It not only confirmed the primary distal/proximal pattern but also successfully isolated a smaller, highly interesting group of genes. This group showed a specific expression pattern primarily in the Male_distal samples, which aligns perfectly with your hypothesis about sex-specific gene expression.
In summary, choosing k=4 allowed us to find a balance between the statistical recommendations and the biological questions we were trying to answer. It gave us a clean, interpretable heatmap that confirmed the dominant patterns and, most importantly, revealed the specific, sex-related pattern we were looking for.
Perform hierarchical clustering
# Perform hierarchical clustering
d <- dist(z_score_mat, method = "euclidean")
final_clust <- hclust(d, method = "ward.D2")
# Cut the tree into the optimal number of clusters (num_clusters_optimo)
gene_clusters <- cutree(final_clust, k = num_clusters_optimo)
# Create the annotation data.frame for ComplexHeatmap
gene_clusters_df <- data.frame(
Cluster = factor(paste("Grupo", gene_clusters, sep = "_")),
row.names = names(gene_clusters)
)Goal:
Our objective is to group genes with similar expression patterns into clusters. This helps us identify sets of genes that might be co-regulated or involved in similar biological processes.
Hierarchical clustering is a method of grouping data points based on their similarity. Unlike other methods like K-means, it doesn’t require us to know how many clusters we want to find in advance. Instead, it builds a “tree” of clusters that we can later examine to decide on the optimal number of groups.
Next, the algorithm starts with each gene as its own cluster. It then repeatedly merges the two closest clusters together until all genes are in a single, large cluster. The hclust() function handles this process.
We specify a linkage criterion to decide which clusters to merge. We use "ward.D2" (Ward’s method), which is a common choice that aims to minimize the variance within each cluster as they are merged.
Determine the Optimal Number of Clusters (k):
The result of hierarchical clustering is a dendrogram (a tree-like diagram). We can visually inspect this to find a good number of clusters, but a statistical method is more robust.
The Gap Statistic is a popular method that helps us find the optimal k by comparing the compactness of our clusters to a random null distribution.
“Cut” the Tree to Create the Final Clusters:
Once we have decided on the optimal number of clusters (in our case, k=4), we “cut” the dendrogram at that level. The cutree() function does this for us, assigning each gene to one of the k clusters.
# Step 3.1: Define column annotation with your conditions
# Get the names of your 4 conditions directly from the data
conditions <- levels(dds_kallisto_collapsed$condition)
# Create a color vector using the "Pastel1" palette from RColorBrewer
# The "Pastel1" palette has 9 colors, so we can use 4 without issues
# Assign the condition names to the colors
colores_condiciones <- setNames(
brewer.pal(n = 4, "Pastel1"),
conditions
)
# You can check the assigned colors
print(colores_condiciones) Male_distal Female_distal Female_proximal Male_proximal
"#FBB4AE" "#B3CDE3" "#CCEBC5" "#DECBE4"
# Create the column annotation object
top_ann <- HeatmapAnnotation(
Condition = dds_kallisto_collapsed$condition,
col = list(Condition = colores_condiciones),
gp = gpar(col = "black")
)# Step 3.2: Define row annotation (gene clusters)
# `num_clusters_optimo` is the variable you defined in the previous step.
num_clusters_optimo <- length(levels(gene_clusters_df$Cluster))
# Create colors for the clusters.
# If the number of clusters is 3 or more, we use the "Paired" palette.
# The Paired palette has a minimum of 3 colors and a maximum of 12.
colores_clusters <- setNames(
brewer.pal(num_clusters_optimo, "Paired"),
levels(gene_clusters_df$Cluster)
)
# Create the row annotation object
left_ann <- ComplexHeatmap::rowAnnotation(
Cluster = gene_clusters_df$Cluster,
col = list(Cluster = colores_clusters),
show_annotation_name = FALSE
)# Step 3.5: Reorder the levels of the sample conditions
# This line modifies the DESeq2 object `dds_kallisto_collapsed`.
# Define the new desired order
new_condition_order <- c("Female_distal", "Female_proximal", "Male_distal", "Male_proximal")
# Reorder the levels of the 'condition' factor
dds_kallisto_collapsed$condition <- factor(dds_kallisto_collapsed$condition, levels = new_condition_order)
# Verify the new order of the levels
print(levels(dds_kallisto_collapsed$condition))[1] "Female_distal" "Female_proximal" "Male_distal" "Male_proximal"
# Now, the `column_order` object in your Step 4 will use this new order.# Step 4: Draw the Heatmap with ComplexHeatmap and display the plot
# Order the columns to group the samples by condition
column_order <- order(dds_kallisto_collapsed$condition)
ComplexHeatmap::Heatmap(
z_score_mat,
name = "Relative\nexpression\n(z score)",
col = col_fun,
cluster_rows = TRUE, # Keep the default clustering
row_split = gene_clusters_df$Cluster, # Use the cluster factor
cluster_columns = FALSE,
column_order = column_order,
top_annotation = top_ann,
left_annotation = left_ann,
show_row_names = FALSE,
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 8),
heatmap_legend_param = list(title = "Relative\nexpression\n(z score)",
at = c(-2, 0, 2),
labels = c("-2", "0", "2"))
)pdf("heatmap_avanzado_4_clusters_Pastel1_Paired.pdf", width = 8, height = 10)# Step 4.1: Save the heatmap as a TIFF file
# Open the TIFF graphics device with the desired specifications
tiff("heatmap_avanzado_4_clusters_Pastel1_Paired.tiff", width = 13, height = 10, units = "in", res = 300)
ComplexHeatmap::Heatmap(
z_score_mat,
name = "Relative\nexpression\n(z score)",
col = col_fun,
cluster_rows = TRUE,
row_split = gene_clusters_df$Cluster,
cluster_columns = FALSE,
column_order = column_order,
top_annotation = top_ann,
left_annotation = left_ann,
show_row_names = FALSE,
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 8),
heatmap_legend_param = list(title = "Relative\nexpression\n(z score)",
at = c(-2, 0, 2),
labels = c("-2", "0", "2"))
)
# Close the TIFF graphics device to complete file creation
dev.off()quartz_off_screen
2
# Step 4.2: Save the heatmap as a PDF file
# Open the PDF graphics device with the desired specifications
pdf("heatmap_avanzado_4_clusters_Pastel1_Paired.pdf", width = 13, height = 10)
ComplexHeatmap::Heatmap(
z_score_mat,
name = "Relative\nexpression\n(z score)",
#col = col_fun,
cluster_rows = TRUE,
row_split = gene_clusters_df$Cluster,
cluster_columns = FALSE,
column_order = column_order,
top_annotation = top_ann,
left_annotation = left_ann,
show_row_names = FALSE,
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 8),
heatmap_legend_param = list(title = "Relative\nexpression\n(z score)",
at = c(-2, 0, 2),
labels = c("-2", "0", "2"))
)
# Close the PDF graphics device to complete file creation
dev.off()quartz_off_screen
2
# Step 4: Draw the Heatmap with visible gene names
# Order the columns to group the samples by condition
column_order <- order(dds_kallisto_collapsed$condition)
pdf("heatmap_avanzado_4_clusters_Pastel1_Paired_names.pdf", width = 17, height = 10)
ComplexHeatmap::Heatmap(
z_score_mat,
name = "Relative\nexpression\n(z score)",
col = col_fun,
cluster_rows = TRUE,
row_split = gene_clusters_df$Cluster,
cluster_columns = FALSE,
column_order = column_order,
top_annotation = top_ann,
left_annotation = left_ann,
show_row_names = TRUE, # <--- CHANGE HERE: names will now be displayed
row_names_gp = gpar(fontsize = 4), # <--- OPTIONAL: adjust font size
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 8),
heatmap_legend_param = list(title = "Relative\nexpression\n(z score)",
at = c(-2, 0, 2),
labels = c("-2", "0", "2"))
)
dev.off()quartz_off_screen
2
# Create a table that combines gene names with their clusters
# Make sure the gene order matches the order in the matrix
gene_cluster_mapping <- data.frame(
GeneID = rownames(z_score_mat),
Cluster = gene_clusters_df$Cluster
)
# Print the complete table
print(gene_cluster_mapping) GeneID Cluster
1 STRG.11600.1 Grupo_1
2 STRG.11765.2 Grupo_2
3 STRG.11770.2 Grupo_1
4 STRG.12677.1 Grupo_2
5 STRG.12825.1 Grupo_1
6 STRG.13469.1 Grupo_2
7 STRG.13473.1 Grupo_1
8 STRG.14809.2 Grupo_2
9 STRG.15456.1 Grupo_2
10 STRG.1588.1 Grupo_1
11 STRG.18882.1 Grupo_1
12 STRG.18882.2 Grupo_3
13 STRG.19212.2 Grupo_4
14 STRG.19212.3 Grupo_2
15 STRG.1974.1 Grupo_2
16 STRG.2001.1 Grupo_2
17 STRG.20574.1 Grupo_2
18 STRG.20575.1 Grupo_2
19 STRG.21136.2 Grupo_4
20 STRG.21210.1 Grupo_2
21 STRG.2159.1 Grupo_2
22 STRG.2167.1 Grupo_1
23 STRG.24254.2 Grupo_1
24 STRG.24262.3 Grupo_4
25 STRG.26518.1 Grupo_4
26 STRG.26541.2 Grupo_4
27 STRG.26555.1 Grupo_1
28 STRG.26560.1 Grupo_2
29 STRG.26560.2 Grupo_1
30 STRG.27354.1 Grupo_2
31 STRG.29632.1 Grupo_2
32 STRG.29634.1 Grupo_1
33 STRG.29634.2 Grupo_3
34 STRG.29635.1 Grupo_3
35 STRG.32492.1 Grupo_2
36 STRG.32906.1 Grupo_2
37 STRG.32908.1 Grupo_3
38 STRG.33350.1 Grupo_3
39 STRG.33350.2 Grupo_3
40 STRG.33519.1 Grupo_3
41 STRG.33519.2 Grupo_2
42 STRG.33520.1 Grupo_2
43 STRG.33523.1 Grupo_1
44 STRG.33525.1 Grupo_1
45 STRG.33525.2 Grupo_2
46 STRG.33530.1 Grupo_3
47 STRG.33530.2 Grupo_3
48 STRG.35168.1 Grupo_3
49 STRG.35168.2 Grupo_2
50 STRG.36829.3 Grupo_1
51 STRG.37954.1 Grupo_2
52 STRG.38401.1 Grupo_1
53 STRG.38402.1 Grupo_4
54 STRG.38405.1 Grupo_2
55 STRG.38406.1 Grupo_2
56 STRG.38773.1 Grupo_1
57 STRG.39111.1 Grupo_2
58 STRG.39429.2 Grupo_1
59 STRG.43176.1 Grupo_1
60 STRG.45426.1 Grupo_4
61 STRG.45427.1 Grupo_4
62 STRG.45427.2 Grupo_1
63 STRG.46031.1 Grupo_2
64 STRG.46031.3 Grupo_3
65 STRG.46044.1 Grupo_2
66 STRG.46044.2 Grupo_2
67 STRG.46044.3 Grupo_2
68 STRG.46275.1 Grupo_4
69 STRG.49562.1 Grupo_4
70 STRG.49563.1 Grupo_4
71 STRG.50071.1 Grupo_2
72 STRG.50985.1 Grupo_3
73 STRG.50989.1 Grupo_3
74 STRG.50992.1 Grupo_3
75 STRG.52863.1 Grupo_1
76 STRG.52863.2 Grupo_1
77 STRG.52918.5 Grupo_2
78 STRG.52927.1 Grupo_1
79 STRG.52929.1 Grupo_4
80 STRG.5450.2 Grupo_4
81 STRG.56703.1 Grupo_4
82 STRG.56704.1 Grupo_4
83 STRG.56704.2 Grupo_1
84 STRG.56704.3 Grupo_4
85 STRG.56704.4 Grupo_4
86 STRG.56704.5 Grupo_4
87 STRG.56820.4 Grupo_2
88 STRG.61662.1 Grupo_1
89 STRG.62762.1 Grupo_2
90 STRG.63201.1 Grupo_1
91 STRG.63712.1 Grupo_2
92 STRG.63751.1 Grupo_1
93 STRG.63752.1 Grupo_2
94 STRG.63755.1 Grupo_1
95 STRG.65166.1 Grupo_2
96 STRG.65166.2 Grupo_2
97 STRG.7545.1 Grupo_2
98 STRG.7818.1 Grupo_1
99 STRG.7819.1 Grupo_1
100 TRINITY_DN1025_c0_g1_i2_S412_S2.p1_replace_STRG.7561 Grupo_4
101 TRINITY_DN1042_c0_g1_i12_S498_S3.p1_replace_STRG.26520 Grupo_4
102 TRINITY_DN116590_c0_g1_i1_S419_S8.p1_replace_STRG.26541 Grupo_4
103 TRINITY_DN12670_c0_g1_i3_S414_S3.p1_replace_STRG.50993 Grupo_3
104 TRINITY_DN21302_c0_g1_i1_S498_S3.p1_replace_STRG.4504 Grupo_4
# Or, if you prefer to see one cluster at a time, for example, Grupo_1
# print(gene_cluster_mapping[gene_cluster_mapping$Cluster == "Grupo_1", ])