View source on GitHub

1. Introduction

This is an R Markdown Notebook. The main package used to generate the heatmap is ‘pheatmap’ in RStudio. Use updateR() if needed from the ‘installr’ package.

version
getRversion()
options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("installr")
library(installr)
install.packages("rstudioapi")
library(rstudioapi)
rstudio_version <- rstudioapi::versionInfo()$version
cat("RStudio version used:", as.character(rstudio_version), "\n")

1.1. Load the necessary packages

install.packages("pheatmap")
library(pheatmap)
install.packages("extrafont")
library(extrafont)
font_import(pattern="[A/a]rial")
install.packages("tidyverse") #include ggplot2, tidyr and dplyr
library(tidyverse)
install.packages("RColorBrewer")
library(RColorBrewer)
install.packages("compositions")
library(compositions)

1.2. References

citation()
version$version.string
citation("pheatmap")
packageVersion("pheatmap")
#https://cran.r-project.org/web/packages/pheatmap/

1.3. Set and get working directory

By default, the working directory for R code chunks is the directory that contains the Rmd document.

getwd()

2. Metadata

The metadata file contains the sample names as rows and sample features as columns.

2.1. Load the metadata file

Rename function from dplyr package.

metadata <- read.csv("GH22_combined_Metadata_withsimple_Finetuned_Productivity4.csv", check.names = FALSE)
colnames(metadata)
rownames(metadata)
metadata
metadata <- metadata %>% rename(Sample_Name = Name)
metadata <- metadata %>% rename(Metabolism = Fermentation_phase_finetuned)
colnames(metadata)
listmet <- metadata$Metabolism
listmet
listmetunique <- unique(listmet)
listmetunique

Specify row names and select samples if required. For the article ““, we select samples from reactors #1 and #2, BSG samples and inoculum samples (102 samples, without R3 and R4 reactors).

row.names(metadata) <- metadata$Sample_Name
metadataarticle <- metadata %>% filter(grepl('R2|R1|/', Reactor))
metadataarticle[1:10,1:10]
listarticle <- metadataarticle$Sample_Name
listarticle
listmetabolism <- metadataarticle$Metabolism
listmetabolism
unique(listmetabolism)
colnames(metadataarticle)

Add metabolic stages for homoacetogenesis and Elong_H2CO2 (auto. vs. mixo.) in a new column “Main_Metabolic_Phase”

metadataarticle$Main_Metabolic_Phase <- listmetabolism
metadataarticle$Main_Metabolic_Phase
colnames(metadataarticle)

old_name <- "Homoacetogenesis"
# Specify the new name
new_name <- "Homoacetogenesis_Mixo"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Homoacetogenesis"
# Specify the new name
new_name <- "Homoacetogenesis_Auto"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Elong_H2CO2"
# Specify the new name
new_name <- "Elong_H2CO2_Mixo"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Elong_H2CO2"
# Specify the new name
new_name <- "Elong_H2CO2_Auto"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase
unique(metadataarticle$Main_Metabolic_Phase)
unique(listmetabolism)

2.2. Metadata file cleaning

colnames(metadataarticle)
metadataarticle2 <- metadataarticle[, !colnames(metadataarticle) %in% c("Sample","Reverse_primer","Barcode_i7","Index i7 Leuven","Experiment","Replicate","Name","Sample_ID")]
colnames(metadataarticle2)
metadataarticle2[1:10,1:10]
metadataarticle2$Fermentation_phase_simplified
metadataarticle2$Metabolism
unique(metadataarticle2$Fermentation_phase_simplified)
unique(metadataarticle2$Metabolism)
unique(metadataarticle2$Metabolism_Carbon)
unique(metadataarticle2$Main_Metabolic_Phase)

2.3. Metadata file subsetting

Without lag phases, and the inoculum/BSG samples

metadataarticle3 <- metadataarticle2 %>% filter(grepl('Elong_H2CO2_Auto|Homoacetogenesis_Auto|Homoacetogenesis_Mixo|Elong_H2CO2_Mixo|Elong_EtOH&Lactate|Acidogenesis', Main_Metabolic_Phase))
rownames(metadataarticle3)

#With Methanogens
metadataarticle4 <- metadataarticle2 %>% filter(grepl('Elong_H2CO2_Auto|Homoacetogenesis_Auto|Homoacetogenesis_Mixo|Elong_H2CO2_Mixo|Elong_EtOH&Lactate|Acidogenesis|Methanogenesis', Main_Metabolic_Phase))
rownames(metadataarticle4)

3. Read the abundance table (taxa)

The abundance file contains the ASV numbers as rows and sample names as columns. The abundance table contains 123 samples and 1866 ASVs.

taxa <- read.table(file = "feature_table_final_rarefied.tsv", sep = "\t", header = TRUE, check.names=F)
taxa
head(taxa[, 125], 20)
colnames(taxa)
head(rownames(taxa), 20)
length(rownames(taxa))
dim(taxa)

4. Read the taxonomy table (taxonomic classification)

The taxonomy file contains the ASV identifiers as rows and taxonomic ranks as columns, from Kingdom to Genus. The ASV_table file was extracted from the rarefied abundance table.

taxanames <- read.table(file = "ASV_table.csv", sep = ",", header = TRUE, check.names=F)
asv_table <- taxanames
asv_table
colnames(asv_table)
df <- separate(asv_table, taxonomy, into = c("Kingdom", "Phylum", "Class", "Order","Family","Genus"), sep = ";", convert = TRUE)
colnames(df)
df[1:50,1:7]
class(df)
fill_value <- "NA"
df$Family[df$Family == ""] <- fill_value
df$Kingdom[df$Kingdom == ""] <- fill_value
df$Phylum[df$Phylum == ""] <- fill_value
df$Class[df$Class == ""] <- fill_value
df$Order[df$Order == ""] <- fill_value
df$Genus[df$Genus == ""] <- fill_value
df[1:50,1:7]
df$Genus <- df$Genus %>% replace_na('NA')
df$Family <- df$Family %>% replace_na('NA')
df$Class <- df$Class %>% replace_na('NA')
df$Order <- df$Order %>% replace_na('NA')
df$Phylum <- df$Phylum %>% replace_na('NA')
df$Kingdom <- df$Kingdom %>% replace_na('NA')
df[1:50,1:7]
old_char <- "NA"
new_char <- "Unclassified"
df_replaced <- data.frame(lapply(df, function(x) gsub(old_char, new_char, x)))
df_replaced
df_replaced[1:50,1:7]
asv_table2 <- df_replaced
asv_table2

Create “ASV_#” and “ASV_#_Genus” identifiers

listnumber <- asv_table$FEATURE_ID
head(listnumber,20)
#Pay attention: This line defines a function called modify_vector_elements that takes two arguments: vec (the input vector) and ASV (the Additional String Vector).
#ASV_Additional String Vector
modify_vector_elements <- function(vec, ASV) {
  modified_vec <- paste0(ASV, vec)
  return(modified_vec)
}
FEATURE_ID2 <- modify_vector_elements(listnumber, "ASV_")
head(FEATURE_ID2,20)

length(FEATURE_ID2)
length(listnumber)

row.names(asv_table2) <- FEATURE_ID2
asv_table2
asv_table2 <- asv_table2[,-1]
colnames(asv_table2)
asv_table2$ASV <- rownames(asv_table2)
colnames(asv_table2)
asv_table2$ASV_Genus <- paste(asv_table2$ASV, asv_table2$Genus, sep = "_")
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)
asv_table2$ASV_Genus <- gsub("g__", "", asv_table2$ASV_Genus)
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)

listnumber2 <- asv_table2$ASV_Genus
head(listnumber2,20)

5. Object summary and characteristics

From the summary of abundance table, the value of 0.5359 corresponds to the mean, i.e., 1000 counts / 1866 detected ASVs.

is.object(metadataarticle2)
is.object(taxa)
is.object(asv_table2)

class(metadataarticle2)
class(taxa)
class(asv_table2)

summary(metadataarticle2)
head(row.names(metadataarticle2),20)
head(colnames(metadataarticle2),20)
dim(taxa)
summary(taxa)
length(unique(taxa$FEATURE_ID))
max(taxa$FEATURE_ID)
min(taxa$FEATURE_ID)
head(row.names(taxa),20)
head(colnames(taxa),20)
summary(asv_table2)
head(row.names(asv_table2),20)
head(colnames(asv_table2),20)

Change column names in taxa to match row names of metadata (before selecting samples) and change ASV identifiers using ASV_#_Genus.

taxa
colnames(taxa)
head(taxa$FEATURE_ID,25)
head(taxa$taxonomy,25)
taxa$ASV_Genus <- listnumber2
head(taxa$ASV_Genus,25)
head(row.names(taxa),20)
row.names(taxa) <- taxa$ASV_Genus
colnames(taxa)
taxa <- taxa[,-1]
colnames(taxa)
taxa <- taxa[,-124]
taxa <- taxa[,-124]
colnames(taxa)
taxa[1:10,1:10]

#Change column names of taxa based on row names of metadata. Distinctions between Hyphen or Dash, and Underscores! 
colnames(taxa)
dim(taxa)
rownames(metadata)
dim(metadata)

colnames(taxa) <- rownames(metadata)
colnames(taxa)

#the identical() function in R does check both the values and their order.
identical(colnames(taxa), rownames(metadata))

6. Sample selection for heatmap construction

To construct the heatmap, we select metadataarticle3 including the metabolic phases: Elong_H2CO2_Auto, Homoacetogenesis_Auto, Homoacetogenesis_Mixo, Elong_H2CO2_Mixo, Elong_EtOH&Lactate, and Acidogenesis.

If methanogenesis included, make reference to metadataarticle4.

columns_to_keep <- rownames(metadataarticle3)
columns_to_keep

columns_to_keep2 <- rownames(metadataarticle4)
columns_to_keep2

# Subset the dataframe to include only the specified columns
taxa_subset <- taxa[, columns_to_keep]
taxa_subset
dim(taxa_subset)

taxa_subset2 <- taxa[, columns_to_keep2]
taxa_subset2
dim(taxa_subset2)
taxa_subset2[1:20,1:20]

head(rowSums(taxa),20)
head(rowSums(taxa_subset),20)
head(rowSums(taxa_subset2),20)

length(rowSums(taxa))
length(rowSums(taxa_subset))
length(rowSums(taxa_subset2))

In taxa file, select ASVs with a relative abundance of 0 in all samples.

asvs_zero_sum <- rownames(taxa)[rowSums(taxa) == 0]
length(asvs_zero_sum)

asvs_zero_sum2 <- rownames(taxa_subset)[rowSums(taxa_subset) == 0]
length(asvs_zero_sum2)
asvs_zero_sum2

asvs_zero_sum3 <- rownames(taxa_subset2)[rowSums(taxa_subset2) == 0]
length(asvs_zero_sum3)

In taxa file, select ASVs with a relative abundance greather than 0 in all samples. 635 ASVs detected in Elong_H2CO2_Auto, Homoacetogenesis_Auto, Homoacetogenesis_Mixo, Elong_H2CO2_Mixo, Elong_EtOH&Lactate, and Acidogenesis.

asvs_zero_sum4 <- rownames(taxa_subset)[rowSums(taxa_subset) != 0]
length(asvs_zero_sum4)
asvs_zero_sum4

Select ASVs greater than zero in taxa_subset

taxa_subset
asvs_zero_sum4
class(taxa_subset)
class(asvs_zero_sum4)

selected_rows <- taxa_subset[rownames(taxa_subset) %in% asvs_zero_sum4, ]
selected_rows
dim(selected_rows)

summary(selected_rows)

asv_counts_per_sample <- apply(selected_rows, 2, function(x) sum(x > 0))
asv_counts_per_sample

If you want to determine how many times an ASV is detected in selected samples, you can use the R expression function(x) sum(x > 0). This defines an anonymous function that counts the number of elements in x that are greater than 0.

asv_detection_counts <- apply(selected_rows, 1, function(x) sum(x > 0))
asv_detection_counts
asv_detection_counts_sorted <- sort(asv_detection_counts, decreasing = TRUE)
asv_detection_counts_sorted

7. Annotations and color palettes for heatmap construction

#Define annotations
df <- metadataarticle3[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df)
df
class(df)

df2 <- metadataarticle4[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df2)

#Color palette
mypal5 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
mypal5

length(mypal5)

# Visualize the palette as a horizontal strip
image(1:100, 1, as.matrix(1:100), col = mypal5, axes = FALSE, xlab = "", ylab = "")

annotation_colors <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571"  )
)

annotation_colors2 <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571",
    Methanogenesis = "blue")
)

annotation_colors3 <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571"  ),
  Phylum = c(
  "p__Actinobacteriota"             = "#ff0000",
  "p__Bacteroidota"                 = "#000000",
  "p__Firmicutes"                   = "#ffff00")
)

To add row annotations in the heatmap.

asv_annotation <- asv_table2[, c("ASV_Genus", "Phylum")]
rownames(asv_annotation) <- asv_annotation$ASV_Genus
head(asv_annotation)
asv_annotation <- asv_annotation[, "Phylum", drop = FALSE]
head(asv_annotation)
unique(asv_annotation$Phylum)

identical(row.names(asv_annotation),row.names(taxa_subset))

class(asv_annotation)

taxatop25f
list <- row.names(taxatop25f)
list
asv_annotation
rownames(asv_annotation)
dim(asv_annotation)

asv_annotation_subset <- asv_annotation[rownames(asv_annotation) %in% rownames(taxatop25f), , drop = FALSE]
asv_annotation_subset
class(asv_annotation_subset)

8. Select the TOP25/30 taxa in all selected samples

taxa_subset
taxa_subsett <- t(taxa_subset)
taxa_subsett_decr <- taxa_subsett[,order(colSums(taxa_subsett),decreasing=TRUE)]
rownames(taxa_subsett_decr)
colnames(taxa_subsett_decr)
top25list <- colnames(taxa_subsett_decr)[1:25]
top25list
taxatop25 <- data.frame(taxa_subsett_decr[,colnames(taxa_subsett_decr) %in% top25list], check.names = FALSE)
taxatop25
taxatop25f <- t(taxatop25)
taxatop25f
colnames(taxatop25f)
colSums(taxatop25f)
rowSums(taxatop25f)
write.csv(taxatop25f, "taxatop25f.csv", row.names = TRUE)

w <- taxatop25f+1
write.csv(w, "taxatop25fwithone.csv", row.names = TRUE)

taxa_subset2
taxa_subset2t <- t(taxa_subset2)
taxa_subset2t_decr <- taxa_subset2t[,order(colSums(taxa_subset2t),decreasing=TRUE)]
rownames(taxa_subset2t_decr)
colnames(taxa_subset2t_decr)
top30listm <- colnames(taxa_subset2t_decr)[1:30]
top30listm
taxatop30m <- data.frame(taxa_subset2t_decr[,colnames(taxa_subset2t_decr) %in% top30listm], check.names = FALSE)
taxatop30m
taxatop30mf <- t(taxatop30m)
taxatop30mf
colSums(taxatop30mf)

9. Data transformation and heatmap construction

Add 1 to your rarefied counts before log transformation: log(x + 1) is a standard preprocessing step.

taxatop25nozero <- taxatop25f+1
write.csv(taxatop25nozero, "taxatop25fnozero_2.csv", row.names = TRUE)

range(taxatop25nozero, na.rm = TRUE)

Euclidean distances are used by default in pheatmap, for both rows and columns (clustering_distance_rows = “euclidean”; clustering_distance_cols = “euclidean”)

9.1. Log2 transformation

LOG2(1) = 0 LOG2(0) undefined

datlog2 <- log2(taxatop25f+1)

range(datlog2, na.rm = TRUE)
summary(as.vector(datlog2))

str(datlog2)
is.matrix(datlog2)
is.numeric(datlog2)

This command uses the pheatmap package in R to generate a heatmap and stores the result (including dendrogram information) in the object plog2_average. pheatmap() uses here average linkage (UPGMA) for hierarchical clustering of rows and columns. cutree_cols = 7 cuts the column dendrogram into 7 clusters. This means: Columns (samples) will be grouped into 7 clusters, and these clusters are shown with colored bars based on defined annotations. cutree_rows = 10 cuts the row dendrogram into 10 clusters. This groups the taxa (rows) into 10 clusters, and helps visually identify groups of similar features.

tiff("plog2_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
tiff("plog2_average_2_rows.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_row=asv_annotation_subset, annotation_colors = annotation_colors3, cutree_cols = 7, cutree_rows = 10)
dev.off()

Define scale of color palette.

my_breaks <- seq(0, 10, by = 0.5)
mypal6 <- colorRampPalette(RColorBrewer::brewer.pal(9, "YlGnBu"))(length(my_breaks) - 1)
mypal6 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(20)
mypal6

length(mypal5)
length(mypal6)

str(my_breaks)
is.numeric(my_breaks)             
all(is.finite(my_breaks))          
length(mypal5) == length(my_breaks) - 1
print(length(mypal5))

tiff("plog2_average_2_scalecolor.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(mat = datlog2, color = mypal6, clustering_method="average", breaks = my_breaks, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()

9.2. Log10 transformation

datlog10 <- log10(taxatop25f+1)

range(datlog10, na.rm = TRUE)
summary(as.vector(datlog10))

str(datlog10)
is.matrix(datlog10)
is.numeric(datlog10)
tiff("plog10_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
plog10_average <- pheatmap(datlog10, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()

9.3. clr transformation

clr (Centered Log-Ratio) transformation from the package compositions. clr uses the geometric mean of all components (in microbiome data, each component in a compositional vector corresponds to a specific ASV or OTU).

taxatop25f
row.names(taxatop25f)
colnames(taxatop25f)

taxatop25f2 <- taxatop25f+1
write.csv(taxatop25f2, "taxatop25fplusone.csv", row.names = TRUE)

datclr <- clr(acomp(taxatop25f+1))

range(datclr, na.rm = TRUE)
summary(as.vector(datclr))

class(datclr)
is.matrix(datclr)
is.numeric(datclr)

datclr_mat <- as.matrix(datclr)
datclr_mat
write.csv(datclr_mat, "datclr_mat.csv", row.names = TRUE)
tiff("pclr_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
tiff("pclr_average_2_mat.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr_mat, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
---
title: 'Microbiome analysis: clustered heatmaps'
author: "B. Stenuit, UCLouvain, Earth & Life Institute, Louvain-la-Neuve"
date: "`r Sys.Date()`"
output: html_notebook
---

[View source on GitHub](https://github.com/BenStenuit/Heatmap_Chain_elong)


# 1. <u>Introduction</u>

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. The main package used to generate the heatmap is 'pheatmap' in RStudio. Use updateR() if needed from the 'installr' package.

```{r}
version
getRversion()
options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("installr")
library(installr)
install.packages("rstudioapi")
library(rstudioapi)
rstudio_version <- rstudioapi::versionInfo()$version
cat("RStudio version used:", as.character(rstudio_version), "\n")
```

## 1.1. Load the necessary packages

```{r}
install.packages("pheatmap")
library(pheatmap)
install.packages("extrafont")
library(extrafont)
font_import(pattern="[A/a]rial")
install.packages("tidyverse") #include ggplot2, tidyr and dplyr
library(tidyverse)
install.packages("RColorBrewer")
library(RColorBrewer)
install.packages("compositions")
library(compositions)
```

## 1.2. References

```{r}
citation()
version$version.string
citation("pheatmap")
packageVersion("pheatmap")
#https://cran.r-project.org/web/packages/pheatmap/
```

## 1.3. Set and get working directory

By default, the working directory for R code chunks is the directory that contains the Rmd document.

```{r}
getwd()
```


# 2. <u>Metadata</u>

The metadata file contains the sample names as rows and sample features as columns.

## 2.1. Load the metadata file

Rename function from dplyr package.

```{r}
metadata <- read.csv("GH22_combined_Metadata_withsimple_Finetuned_Productivity4.csv", check.names = FALSE)
colnames(metadata)
rownames(metadata)
metadata
metadata <- metadata %>% rename(Sample_Name = Name)
metadata <- metadata %>% rename(Metabolism = Fermentation_phase_finetuned)
colnames(metadata)
listmet <- metadata$Metabolism
listmet
listmetunique <- unique(listmet)
listmetunique
```

Specify row names and select samples if required.
For the article "", we select samples from reactors #1 and #2, BSG samples and inoculum samples (102 samples, without R3 and R4 reactors).

```{r}
row.names(metadata) <- metadata$Sample_Name
metadataarticle <- metadata %>% filter(grepl('R2|R1|/', Reactor))
metadataarticle[1:10,1:10]
listarticle <- metadataarticle$Sample_Name
listarticle
listmetabolism <- metadataarticle$Metabolism
listmetabolism
unique(listmetabolism)
colnames(metadataarticle)
```

Add metabolic stages for homoacetogenesis and Elong_H2CO2 (auto. vs. mixo.) in a new column "Main_Metabolic_Phase"

```{r}
metadataarticle$Main_Metabolic_Phase <- listmetabolism
metadataarticle$Main_Metabolic_Phase
colnames(metadataarticle)

old_name <- "Homoacetogenesis"
# Specify the new name
new_name <- "Homoacetogenesis_Mixo"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Homoacetogenesis"
# Specify the new name
new_name <- "Homoacetogenesis_Auto"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Elong_H2CO2"
# Specify the new name
new_name <- "Elong_H2CO2_Mixo"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Elong_H2CO2"
# Specify the new name
new_name <- "Elong_H2CO2_Auto"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase
unique(metadataarticle$Main_Metabolic_Phase)
unique(listmetabolism)
```

## 2.2. Metadata file cleaning

```{r}
colnames(metadataarticle)
metadataarticle2 <- metadataarticle[, !colnames(metadataarticle) %in% c("Sample","Reverse_primer","Barcode_i7","Index i7 Leuven","Experiment","Replicate","Name","Sample_ID")]
colnames(metadataarticle2)
metadataarticle2[1:10,1:10]
metadataarticle2$Fermentation_phase_simplified
metadataarticle2$Metabolism
unique(metadataarticle2$Fermentation_phase_simplified)
unique(metadataarticle2$Metabolism)
unique(metadataarticle2$Metabolism_Carbon)
unique(metadataarticle2$Main_Metabolic_Phase)
```

## 2.3. Metadata file subsetting

Without lag phases, and the inoculum/BSG samples

```{r}
metadataarticle3 <- metadataarticle2 %>% filter(grepl('Elong_H2CO2_Auto|Homoacetogenesis_Auto|Homoacetogenesis_Mixo|Elong_H2CO2_Mixo|Elong_EtOH&Lactate|Acidogenesis', Main_Metabolic_Phase))
rownames(metadataarticle3)

#With Methanogens
metadataarticle4 <- metadataarticle2 %>% filter(grepl('Elong_H2CO2_Auto|Homoacetogenesis_Auto|Homoacetogenesis_Mixo|Elong_H2CO2_Mixo|Elong_EtOH&Lactate|Acidogenesis|Methanogenesis', Main_Metabolic_Phase))
rownames(metadataarticle4)
```


# 3. <u>Read the abundance table (taxa)</u>

The abundance file contains the ASV numbers as rows and sample names as columns.
The abundance table contains 123 samples and 1866 ASVs.

```{r}
taxa <- read.table(file = "feature_table_final_rarefied.tsv", sep = "\t", header = TRUE, check.names=F)
taxa
head(taxa[, 125], 20)
colnames(taxa)
head(rownames(taxa), 20)
length(rownames(taxa))
dim(taxa)
```


## 4. <u>Read the taxonomy table (taxonomic classification)</u>

The taxonomy file contains the ASV identifiers as rows and taxonomic ranks as columns, from Kingdom to Genus. The ASV_table file was extracted from the rarefied abundance table.

```{r}
taxanames <- read.table(file = "ASV_table.csv", sep = ",", header = TRUE, check.names=F)
asv_table <- taxanames
asv_table
colnames(asv_table)
df <- separate(asv_table, taxonomy, into = c("Kingdom", "Phylum", "Class", "Order","Family","Genus"), sep = ";", convert = TRUE)
colnames(df)
df[1:50,1:7]
class(df)
fill_value <- "NA"
df$Family[df$Family == ""] <- fill_value
df$Kingdom[df$Kingdom == ""] <- fill_value
df$Phylum[df$Phylum == ""] <- fill_value
df$Class[df$Class == ""] <- fill_value
df$Order[df$Order == ""] <- fill_value
df$Genus[df$Genus == ""] <- fill_value
df[1:50,1:7]
df$Genus <- df$Genus %>% replace_na('NA')
df$Family <- df$Family %>% replace_na('NA')
df$Class <- df$Class %>% replace_na('NA')
df$Order <- df$Order %>% replace_na('NA')
df$Phylum <- df$Phylum %>% replace_na('NA')
df$Kingdom <- df$Kingdom %>% replace_na('NA')
df[1:50,1:7]
old_char <- "NA"
new_char <- "Unclassified"
df_replaced <- data.frame(lapply(df, function(x) gsub(old_char, new_char, x)))
df_replaced
df_replaced[1:50,1:7]
asv_table2 <- df_replaced
asv_table2
```

Create "ASV_#" and "ASV_#_Genus" identifiers

```{r}
listnumber <- asv_table$FEATURE_ID
head(listnumber,20)
#Pay attention: This line defines a function called modify_vector_elements that takes two arguments: vec (the input vector) and ASV (the Additional String Vector).
#ASV_Additional String Vector
modify_vector_elements <- function(vec, ASV) {
  modified_vec <- paste0(ASV, vec)
  return(modified_vec)
}
FEATURE_ID2 <- modify_vector_elements(listnumber, "ASV_")
head(FEATURE_ID2,20)

length(FEATURE_ID2)
length(listnumber)

row.names(asv_table2) <- FEATURE_ID2
asv_table2
asv_table2 <- asv_table2[,-1]
colnames(asv_table2)
asv_table2$ASV <- rownames(asv_table2)
colnames(asv_table2)
asv_table2$ASV_Genus <- paste(asv_table2$ASV, asv_table2$Genus, sep = "_")
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)
asv_table2$ASV_Genus <- gsub("g__", "", asv_table2$ASV_Genus)
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)

listnumber2 <- asv_table2$ASV_Genus
head(listnumber2,20)
```


## 5. <u>Object summary and characteristics</u>

From the summary of abundance table, the value of 0.5359 corresponds to the mean, i.e., 1000 counts / 1866 detected ASVs.

```{r}
is.object(metadataarticle2)
is.object(taxa)
is.object(asv_table2)

class(metadataarticle2)
class(taxa)
class(asv_table2)

summary(metadataarticle2)
head(row.names(metadataarticle2),20)
head(colnames(metadataarticle2),20)
dim(taxa)
summary(taxa)
length(unique(taxa$FEATURE_ID))
max(taxa$FEATURE_ID)
min(taxa$FEATURE_ID)
head(row.names(taxa),20)
head(colnames(taxa),20)
summary(asv_table2)
head(row.names(asv_table2),20)
head(colnames(asv_table2),20)
```

Change column names in taxa to match row names of metadata (before selecting samples) and change ASV identifiers using ASV_#_Genus.

```{r}
taxa
colnames(taxa)
head(taxa$FEATURE_ID,25)
head(taxa$taxonomy,25)
taxa$ASV_Genus <- listnumber2
head(taxa$ASV_Genus,25)
head(row.names(taxa),20)
row.names(taxa) <- taxa$ASV_Genus
colnames(taxa)
taxa <- taxa[,-1]
colnames(taxa)
taxa <- taxa[,-124]
taxa <- taxa[,-124]
colnames(taxa)
taxa[1:10,1:10]

#Change column names of taxa based on row names of metadata. Distinctions between Hyphen or Dash, and Underscores! 
colnames(taxa)
dim(taxa)
rownames(metadata)
dim(metadata)

colnames(taxa) <- rownames(metadata)
colnames(taxa)

#the identical() function in R does check both the values and their order.
identical(colnames(taxa), rownames(metadata))
```


## 6. <u>Sample selection for heatmap construction</u>

To construct the heatmap, we select metadataarticle3 including the metabolic phases: Elong_H2CO2_Auto, Homoacetogenesis_Auto, Homoacetogenesis_Mixo, Elong_H2CO2_Mixo, Elong_EtOH&Lactate, and Acidogenesis.

If methanogenesis included, make reference to metadataarticle4.

```{r}
columns_to_keep <- rownames(metadataarticle3)
columns_to_keep

columns_to_keep2 <- rownames(metadataarticle4)
columns_to_keep2

# Subset the dataframe to include only the specified columns
taxa_subset <- taxa[, columns_to_keep]
taxa_subset
dim(taxa_subset)

taxa_subset2 <- taxa[, columns_to_keep2]
taxa_subset2
dim(taxa_subset2)
taxa_subset2[1:20,1:20]

head(rowSums(taxa),20)
head(rowSums(taxa_subset),20)
head(rowSums(taxa_subset2),20)

length(rowSums(taxa))
length(rowSums(taxa_subset))
length(rowSums(taxa_subset2))
```

In taxa file, select ASVs with a relative abundance of 0 in all samples.

```{r}
asvs_zero_sum <- rownames(taxa)[rowSums(taxa) == 0]
length(asvs_zero_sum)

asvs_zero_sum2 <- rownames(taxa_subset)[rowSums(taxa_subset) == 0]
length(asvs_zero_sum2)
asvs_zero_sum2

asvs_zero_sum3 <- rownames(taxa_subset2)[rowSums(taxa_subset2) == 0]
length(asvs_zero_sum3)
```

In taxa file, select ASVs with a relative abundance greather than 0 in all samples.
635 ASVs detected in Elong_H2CO2_Auto, Homoacetogenesis_Auto, Homoacetogenesis_Mixo, Elong_H2CO2_Mixo, Elong_EtOH&Lactate, and Acidogenesis.

```{r}
asvs_zero_sum4 <- rownames(taxa_subset)[rowSums(taxa_subset) != 0]
length(asvs_zero_sum4)
asvs_zero_sum4
```

Select ASVs greater than zero in taxa_subset

```{r}
taxa_subset
asvs_zero_sum4
class(taxa_subset)
class(asvs_zero_sum4)

selected_rows <- taxa_subset[rownames(taxa_subset) %in% asvs_zero_sum4, ]
selected_rows
dim(selected_rows)

summary(selected_rows)

asv_counts_per_sample <- apply(selected_rows, 2, function(x) sum(x > 0))
asv_counts_per_sample
```

If you want to determine how many times an ASV is detected in selected samples, you can use the R expression function(x) sum(x > 0). This defines an anonymous function that counts the number of elements in x that are greater than 0.

```{r}
asv_detection_counts <- apply(selected_rows, 1, function(x) sum(x > 0))
asv_detection_counts
asv_detection_counts_sorted <- sort(asv_detection_counts, decreasing = TRUE)
asv_detection_counts_sorted
```

## 7. <u>Annotations and color palettes for heatmap construction</u>

```{r}
#Define annotations
df <- metadataarticle3[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df)
df
class(df)

df2 <- metadataarticle4[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df2)

#Color palette
mypal5 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
mypal5

length(mypal5)

# Visualize the palette as a horizontal strip
image(1:100, 1, as.matrix(1:100), col = mypal5, axes = FALSE, xlab = "", ylab = "")

annotation_colors <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571"  )
)

annotation_colors2 <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571",
    Methanogenesis = "blue")
)

annotation_colors3 <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571"  ),
  Phylum = c(
  "p__Actinobacteriota"             = "#ff0000",
  "p__Bacteroidota"                 = "#000000",
  "p__Firmicutes"                   = "#ffff00")
)

```

To add row annotations in the heatmap.

```{r}
asv_annotation <- asv_table2[, c("ASV_Genus", "Phylum")]
rownames(asv_annotation) <- asv_annotation$ASV_Genus
head(asv_annotation)
asv_annotation <- asv_annotation[, "Phylum", drop = FALSE]
head(asv_annotation)
unique(asv_annotation$Phylum)

identical(row.names(asv_annotation),row.names(taxa_subset))

class(asv_annotation)

taxatop25f
list <- row.names(taxatop25f)
list
asv_annotation
rownames(asv_annotation)
dim(asv_annotation)

asv_annotation_subset <- asv_annotation[rownames(asv_annotation) %in% rownames(taxatop25f), , drop = FALSE]
asv_annotation_subset
class(asv_annotation_subset)
```

## <u>8. Select the TOP25/30 taxa in all selected samples</u>

```{r}
taxa_subset
taxa_subsett <- t(taxa_subset)
taxa_subsett_decr <- taxa_subsett[,order(colSums(taxa_subsett),decreasing=TRUE)]
rownames(taxa_subsett_decr)
colnames(taxa_subsett_decr)
top25list <- colnames(taxa_subsett_decr)[1:25]
top25list
taxatop25 <- data.frame(taxa_subsett_decr[,colnames(taxa_subsett_decr) %in% top25list], check.names = FALSE)
taxatop25
taxatop25f <- t(taxatop25)
taxatop25f
colnames(taxatop25f)
colSums(taxatop25f)
rowSums(taxatop25f)
write.csv(taxatop25f, "taxatop25f.csv", row.names = TRUE)

w <- taxatop25f+1
write.csv(w, "taxatop25fwithone.csv", row.names = TRUE)

taxa_subset2
taxa_subset2t <- t(taxa_subset2)
taxa_subset2t_decr <- taxa_subset2t[,order(colSums(taxa_subset2t),decreasing=TRUE)]
rownames(taxa_subset2t_decr)
colnames(taxa_subset2t_decr)
top30listm <- colnames(taxa_subset2t_decr)[1:30]
top30listm
taxatop30m <- data.frame(taxa_subset2t_decr[,colnames(taxa_subset2t_decr) %in% top30listm], check.names = FALSE)
taxatop30m
taxatop30mf <- t(taxatop30m)
taxatop30mf
colSums(taxatop30mf)
```


# <u>9. Data transformation and heatmap construction</u>

Add 1 to your rarefied counts before log transformation:
log(x + 1) is a standard preprocessing step.

```{r}
taxatop25nozero <- taxatop25f+1
write.csv(taxatop25nozero, "taxatop25fnozero_2.csv", row.names = TRUE)

range(taxatop25nozero, na.rm = TRUE)
```

Euclidean distances are used by default in pheatmap, for both rows and columns (clustering_distance_rows = "euclidean"; clustering_distance_cols = "euclidean")

## 9.1. Log2 transformation

LOG2(1) = 0
LOG2(0) undefined

```{r}
datlog2 <- log2(taxatop25f+1)

range(datlog2, na.rm = TRUE)
summary(as.vector(datlog2))

str(datlog2)
is.matrix(datlog2)
is.numeric(datlog2)
```

This command uses the pheatmap package in R to generate a heatmap and stores the result (including dendrogram information) in the object plog2_average. pheatmap() uses here average linkage (UPGMA) for hierarchical clustering of rows and columns. cutree_cols = 7 cuts the column dendrogram into 7 clusters. This means: Columns (samples) will be grouped into 7 clusters, and these clusters are shown with colored bars based on defined annotations. cutree_rows = 10 cuts the row dendrogram into 10 clusters. This groups the taxa (rows) into 10 clusters, and helps visually identify groups of similar features.

```{r}
tiff("plog2_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

```{r}
tiff("plog2_average_2_rows.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_row=asv_annotation_subset, annotation_colors = annotation_colors3, cutree_cols = 7, cutree_rows = 10)
dev.off()
```


Define scale of color palette.

```{r}
my_breaks <- seq(0, 10, by = 0.5)
mypal6 <- colorRampPalette(RColorBrewer::brewer.pal(9, "YlGnBu"))(length(my_breaks) - 1)
mypal6 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(20)
mypal6

length(mypal5)
length(mypal6)

str(my_breaks)
is.numeric(my_breaks)             
all(is.finite(my_breaks))          
length(mypal5) == length(my_breaks) - 1
print(length(mypal5))

tiff("plog2_average_2_scalecolor.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(mat = datlog2, color = mypal6, clustering_method="average", breaks = my_breaks, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

## 9.2. Log10 transformation

```{r}
datlog10 <- log10(taxatop25f+1)

range(datlog10, na.rm = TRUE)
summary(as.vector(datlog10))

str(datlog10)
is.matrix(datlog10)
is.numeric(datlog10)
```

```{r}
tiff("plog10_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
plog10_average <- pheatmap(datlog10, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

## 9.3. clr transformation

clr (Centered Log-Ratio) transformation from the package compositions. clr uses the geometric mean of all components (in microbiome data, each component in a compositional vector corresponds to a specific ASV or OTU).

```{r}
taxatop25f
row.names(taxatop25f)
colnames(taxatop25f)

taxatop25f2 <- taxatop25f+1
write.csv(taxatop25f2, "taxatop25fplusone.csv", row.names = TRUE)

datclr <- clr(acomp(taxatop25f+1))

range(datclr, na.rm = TRUE)
summary(as.vector(datclr))

class(datclr)
is.matrix(datclr)
is.numeric(datclr)

datclr_mat <- as.matrix(datclr)
datclr_mat
write.csv(datclr_mat, "datclr_mat.csv", row.names = TRUE)
```

```{r}
tiff("pclr_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

```{r}
tiff("pclr_average_2_mat.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr_mat, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```
