3. Read the abundance table (taxa)
The abundance file contains the ASV numbers as rows and sample names
as columns. The abundance table contains 123 samples and 1866 ASVs.
taxa <- read.table(file = "feature_table_final_rarefied.tsv", sep = "\t", header = TRUE, check.names=F)
taxa
head(taxa[, 125], 20)
colnames(taxa)
head(rownames(taxa), 20)
length(rownames(taxa))
dim(taxa)
4. Read the taxonomy table (taxonomic classification)
The taxonomy file contains the ASV identifiers as rows and taxonomic
ranks as columns, from Kingdom to Genus. The ASV_table file was
extracted from the rarefied abundance table.
taxanames <- read.table(file = "ASV_table.csv", sep = ",", header = TRUE, check.names=F)
asv_table <- taxanames
asv_table
colnames(asv_table)
df <- separate(asv_table, taxonomy, into = c("Kingdom", "Phylum", "Class", "Order","Family","Genus"), sep = ";", convert = TRUE)
colnames(df)
df[1:50,1:7]
class(df)
fill_value <- "NA"
df$Family[df$Family == ""] <- fill_value
df$Kingdom[df$Kingdom == ""] <- fill_value
df$Phylum[df$Phylum == ""] <- fill_value
df$Class[df$Class == ""] <- fill_value
df$Order[df$Order == ""] <- fill_value
df$Genus[df$Genus == ""] <- fill_value
df[1:50,1:7]
df$Genus <- df$Genus %>% replace_na('NA')
df$Family <- df$Family %>% replace_na('NA')
df$Class <- df$Class %>% replace_na('NA')
df$Order <- df$Order %>% replace_na('NA')
df$Phylum <- df$Phylum %>% replace_na('NA')
df$Kingdom <- df$Kingdom %>% replace_na('NA')
df[1:50,1:7]
old_char <- "NA"
new_char <- "Unclassified"
df_replaced <- data.frame(lapply(df, function(x) gsub(old_char, new_char, x)))
df_replaced
df_replaced[1:50,1:7]
asv_table2 <- df_replaced
asv_table2
Create “ASV_#” and “ASV_#_Genus” identifiers
listnumber <- asv_table$FEATURE_ID
head(listnumber,20)
#Pay attention: This line defines a function called modify_vector_elements that takes two arguments: vec (the input vector) and ASV (the Additional String Vector).
#ASV_Additional String Vector
modify_vector_elements <- function(vec, ASV) {
modified_vec <- paste0(ASV, vec)
return(modified_vec)
}
FEATURE_ID2 <- modify_vector_elements(listnumber, "ASV_")
head(FEATURE_ID2,20)
length(FEATURE_ID2)
length(listnumber)
row.names(asv_table2) <- FEATURE_ID2
asv_table2
asv_table2 <- asv_table2[,-1]
colnames(asv_table2)
asv_table2$ASV <- rownames(asv_table2)
colnames(asv_table2)
asv_table2$ASV_Genus <- paste(asv_table2$ASV, asv_table2$Genus, sep = "_")
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)
asv_table2$ASV_Genus <- gsub("g__", "", asv_table2$ASV_Genus)
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)
listnumber2 <- asv_table2$ASV_Genus
head(listnumber2,20)
5. Object summary and characteristics
From the summary of abundance table, the value of 0.5359 corresponds
to the mean, i.e., 1000 counts / 1866 detected ASVs.
is.object(metadataarticle2)
is.object(taxa)
is.object(asv_table2)
class(metadataarticle2)
class(taxa)
class(asv_table2)
summary(metadataarticle2)
head(row.names(metadataarticle2),20)
head(colnames(metadataarticle2),20)
dim(taxa)
summary(taxa)
length(unique(taxa$FEATURE_ID))
max(taxa$FEATURE_ID)
min(taxa$FEATURE_ID)
head(row.names(taxa),20)
head(colnames(taxa),20)
summary(asv_table2)
head(row.names(asv_table2),20)
head(colnames(asv_table2),20)
Change column names in taxa to match row names of metadata (before
selecting samples) and change ASV identifiers using ASV_#_Genus.
taxa
colnames(taxa)
head(taxa$FEATURE_ID,25)
head(taxa$taxonomy,25)
taxa$ASV_Genus <- listnumber2
head(taxa$ASV_Genus,25)
head(row.names(taxa),20)
row.names(taxa) <- taxa$ASV_Genus
colnames(taxa)
taxa <- taxa[,-1]
colnames(taxa)
taxa <- taxa[,-124]
taxa <- taxa[,-124]
colnames(taxa)
taxa[1:10,1:10]
#Change column names of taxa based on row names of metadata. Distinctions between Hyphen or Dash, and Underscores!
colnames(taxa)
dim(taxa)
rownames(metadata)
dim(metadata)
colnames(taxa) <- rownames(metadata)
colnames(taxa)
#the identical() function in R does check both the values and their order.
identical(colnames(taxa), rownames(metadata))
6. Sample selection for heatmap construction
To construct the heatmap, we select metadataarticle3 including the
metabolic phases: Elong_H2CO2_Auto, Homoacetogenesis_Auto,
Homoacetogenesis_Mixo, Elong_H2CO2_Mixo, Elong_EtOH&Lactate, and
Acidogenesis.
If methanogenesis included, make reference to metadataarticle4.
columns_to_keep <- rownames(metadataarticle3)
columns_to_keep
columns_to_keep2 <- rownames(metadataarticle4)
columns_to_keep2
# Subset the dataframe to include only the specified columns
taxa_subset <- taxa[, columns_to_keep]
taxa_subset
dim(taxa_subset)
taxa_subset2 <- taxa[, columns_to_keep2]
taxa_subset2
dim(taxa_subset2)
taxa_subset2[1:20,1:20]
head(rowSums(taxa),20)
head(rowSums(taxa_subset),20)
head(rowSums(taxa_subset2),20)
length(rowSums(taxa))
length(rowSums(taxa_subset))
length(rowSums(taxa_subset2))
In taxa file, select ASVs with a relative abundance of 0 in all
samples.
asvs_zero_sum <- rownames(taxa)[rowSums(taxa) == 0]
length(asvs_zero_sum)
asvs_zero_sum2 <- rownames(taxa_subset)[rowSums(taxa_subset) == 0]
length(asvs_zero_sum2)
asvs_zero_sum2
asvs_zero_sum3 <- rownames(taxa_subset2)[rowSums(taxa_subset2) == 0]
length(asvs_zero_sum3)
In taxa file, select ASVs with a relative abundance greather than 0
in all samples. 635 ASVs detected in Elong_H2CO2_Auto,
Homoacetogenesis_Auto, Homoacetogenesis_Mixo, Elong_H2CO2_Mixo,
Elong_EtOH&Lactate, and Acidogenesis.
asvs_zero_sum4 <- rownames(taxa_subset)[rowSums(taxa_subset) != 0]
length(asvs_zero_sum4)
asvs_zero_sum4
Select ASVs greater than zero in taxa_subset
taxa_subset
asvs_zero_sum4
class(taxa_subset)
class(asvs_zero_sum4)
selected_rows <- taxa_subset[rownames(taxa_subset) %in% asvs_zero_sum4, ]
selected_rows
dim(selected_rows)
summary(selected_rows)
asv_counts_per_sample <- apply(selected_rows, 2, function(x) sum(x > 0))
asv_counts_per_sample
If you want to determine how many times an ASV is detected in
selected samples, you can use the R expression function(x) sum(x >
0). This defines an anonymous function that counts the number of
elements in x that are greater than 0.
asv_detection_counts <- apply(selected_rows, 1, function(x) sum(x > 0))
asv_detection_counts
asv_detection_counts_sorted <- sort(asv_detection_counts, decreasing = TRUE)
asv_detection_counts_sorted
7. Annotations and color palettes for heatmap
construction
#Define annotations
df <- metadataarticle3[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df)
df
class(df)
df2 <- metadataarticle4[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df2)
#Color palette
mypal5 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
mypal5
length(mypal5)
# Visualize the palette as a horizontal strip
image(1:100, 1, as.matrix(1:100), col = mypal5, axes = FALSE, xlab = "", ylab = "")
annotation_colors <- list(
Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
Main_Metabolic_Phase = c(
Acidogenesis = "#30d5c8",
`Elong_EtOH&Lactate` = "#ffd700",
`Elong_H2CO2_Auto` = "#ff6cda",
`Elong_H2CO2_Mixo` = "#9a009a",
Homoacetogenesis_Auto = "#008100",
Homoacetogenesis_Mixo = "#93c571" )
)
annotation_colors2 <- list(
Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
Main_Metabolic_Phase = c(
Acidogenesis = "#30d5c8",
`Elong_EtOH&Lactate` = "#ffd700",
`Elong_H2CO2_Auto` = "#ff6cda",
`Elong_H2CO2_Mixo` = "#9a009a",
Homoacetogenesis_Auto = "#008100",
Homoacetogenesis_Mixo = "#93c571",
Methanogenesis = "blue")
)
annotation_colors3 <- list(
Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
Main_Metabolic_Phase = c(
Acidogenesis = "#30d5c8",
`Elong_EtOH&Lactate` = "#ffd700",
`Elong_H2CO2_Auto` = "#ff6cda",
`Elong_H2CO2_Mixo` = "#9a009a",
Homoacetogenesis_Auto = "#008100",
Homoacetogenesis_Mixo = "#93c571" ),
Phylum = c(
"p__Actinobacteriota" = "#ff0000",
"p__Bacteroidota" = "#000000",
"p__Firmicutes" = "#ffff00")
)
To add row annotations in the heatmap.
asv_annotation <- asv_table2[, c("ASV_Genus", "Phylum")]
rownames(asv_annotation) <- asv_annotation$ASV_Genus
head(asv_annotation)
asv_annotation <- asv_annotation[, "Phylum", drop = FALSE]
head(asv_annotation)
unique(asv_annotation$Phylum)
identical(row.names(asv_annotation),row.names(taxa_subset))
class(asv_annotation)
taxatop25f
list <- row.names(taxatop25f)
list
asv_annotation
rownames(asv_annotation)
dim(asv_annotation)
asv_annotation_subset <- asv_annotation[rownames(asv_annotation) %in% rownames(taxatop25f), , drop = FALSE]
asv_annotation_subset
class(asv_annotation_subset)
8. Select the TOP25/30 taxa in all selected samples
taxa_subset
taxa_subsett <- t(taxa_subset)
taxa_subsett_decr <- taxa_subsett[,order(colSums(taxa_subsett),decreasing=TRUE)]
rownames(taxa_subsett_decr)
colnames(taxa_subsett_decr)
top25list <- colnames(taxa_subsett_decr)[1:25]
top25list
taxatop25 <- data.frame(taxa_subsett_decr[,colnames(taxa_subsett_decr) %in% top25list], check.names = FALSE)
taxatop25
taxatop25f <- t(taxatop25)
taxatop25f
colnames(taxatop25f)
colSums(taxatop25f)
rowSums(taxatop25f)
write.csv(taxatop25f, "taxatop25f.csv", row.names = TRUE)
w <- taxatop25f+1
write.csv(w, "taxatop25fwithone.csv", row.names = TRUE)
taxa_subset2
taxa_subset2t <- t(taxa_subset2)
taxa_subset2t_decr <- taxa_subset2t[,order(colSums(taxa_subset2t),decreasing=TRUE)]
rownames(taxa_subset2t_decr)
colnames(taxa_subset2t_decr)
top30listm <- colnames(taxa_subset2t_decr)[1:30]
top30listm
taxatop30m <- data.frame(taxa_subset2t_decr[,colnames(taxa_subset2t_decr) %in% top30listm], check.names = FALSE)
taxatop30m
taxatop30mf <- t(taxatop30m)
taxatop30mf
colSums(taxatop30mf)
9. Data transformation and heatmap construction
Add 1 to your rarefied counts before log transformation: log(x + 1)
is a standard preprocessing step.
taxatop25nozero <- taxatop25f+1
write.csv(taxatop25nozero, "taxatop25fnozero_2.csv", row.names = TRUE)
range(taxatop25nozero, na.rm = TRUE)
Euclidean distances are used by default in pheatmap, for both rows
and columns (clustering_distance_rows = “euclidean”;
clustering_distance_cols = “euclidean”)
9.1. Log2 transformation
LOG2(1) = 0 LOG2(0) undefined
datlog2 <- log2(taxatop25f+1)
range(datlog2, na.rm = TRUE)
summary(as.vector(datlog2))
str(datlog2)
is.matrix(datlog2)
is.numeric(datlog2)
This command uses the pheatmap package in R to generate a heatmap and
stores the result (including dendrogram information) in the object
plog2_average. pheatmap() uses here average linkage (UPGMA) for
hierarchical clustering of rows and columns. cutree_cols = 7 cuts the
column dendrogram into 7 clusters. This means: Columns (samples) will be
grouped into 7 clusters, and these clusters are shown with colored bars
based on defined annotations. cutree_rows = 10 cuts the row dendrogram
into 10 clusters. This groups the taxa (rows) into 10 clusters, and
helps visually identify groups of similar features.
tiff("plog2_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
tiff("plog2_average_2_rows.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_row=asv_annotation_subset, annotation_colors = annotation_colors3, cutree_cols = 7, cutree_rows = 10)
dev.off()
Define scale of color palette.
my_breaks <- seq(0, 10, by = 0.5)
mypal6 <- colorRampPalette(RColorBrewer::brewer.pal(9, "YlGnBu"))(length(my_breaks) - 1)
mypal6 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(20)
mypal6
length(mypal5)
length(mypal6)
str(my_breaks)
is.numeric(my_breaks)
all(is.finite(my_breaks))
length(mypal5) == length(my_breaks) - 1
print(length(mypal5))
tiff("plog2_average_2_scalecolor.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(mat = datlog2, color = mypal6, clustering_method="average", breaks = my_breaks, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
9.3. clr transformation
clr (Centered Log-Ratio) transformation from the package
compositions. clr uses the geometric mean of all components (in
microbiome data, each component in a compositional vector corresponds to
a specific ASV or OTU).
taxatop25f
row.names(taxatop25f)
colnames(taxatop25f)
taxatop25f2 <- taxatop25f+1
write.csv(taxatop25f2, "taxatop25fplusone.csv", row.names = TRUE)
datclr <- clr(acomp(taxatop25f+1))
range(datclr, na.rm = TRUE)
summary(as.vector(datclr))
class(datclr)
is.matrix(datclr)
is.numeric(datclr)
datclr_mat <- as.matrix(datclr)
datclr_mat
write.csv(datclr_mat, "datclr_mat.csv", row.names = TRUE)
tiff("pclr_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
tiff("pclr_average_2_mat.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr_mat, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
---
title: 'Microbiome analysis: clustered heatmaps'
author: "B. Stenuit, UCLouvain, Earth & Life Institute, Louvain-la-Neuve"
date: "`r Sys.Date()`"
output: html_notebook
---

[View source on GitHub](https://github.com/BenStenuit/Heatmap_Chain_elong)


# 1. <u>Introduction</u>

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. The main package used to generate the heatmap is 'pheatmap' in RStudio. Use updateR() if needed from the 'installr' package.

```{r}
version
getRversion()
options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("installr")
library(installr)
install.packages("rstudioapi")
library(rstudioapi)
rstudio_version <- rstudioapi::versionInfo()$version
cat("RStudio version used:", as.character(rstudio_version), "\n")
```

## 1.1. Load the necessary packages

```{r}
install.packages("pheatmap")
library(pheatmap)
install.packages("extrafont")
library(extrafont)
font_import(pattern="[A/a]rial")
install.packages("tidyverse") #include ggplot2, tidyr and dplyr
library(tidyverse)
install.packages("RColorBrewer")
library(RColorBrewer)
install.packages("compositions")
library(compositions)
```

## 1.2. References

```{r}
citation()
version$version.string
citation("pheatmap")
packageVersion("pheatmap")
#https://cran.r-project.org/web/packages/pheatmap/
```

## 1.3. Set and get working directory

By default, the working directory for R code chunks is the directory that contains the Rmd document.

```{r}
getwd()
```


# 2. <u>Metadata</u>

The metadata file contains the sample names as rows and sample features as columns.

## 2.1. Load the metadata file

Rename function from dplyr package.

```{r}
metadata <- read.csv("GH22_combined_Metadata_withsimple_Finetuned_Productivity4.csv", check.names = FALSE)
colnames(metadata)
rownames(metadata)
metadata
metadata <- metadata %>% rename(Sample_Name = Name)
metadata <- metadata %>% rename(Metabolism = Fermentation_phase_finetuned)
colnames(metadata)
listmet <- metadata$Metabolism
listmet
listmetunique <- unique(listmet)
listmetunique
```

Specify row names and select samples if required.
For the article "", we select samples from reactors #1 and #2, BSG samples and inoculum samples (102 samples, without R3 and R4 reactors).

```{r}
row.names(metadata) <- metadata$Sample_Name
metadataarticle <- metadata %>% filter(grepl('R2|R1|/', Reactor))
metadataarticle[1:10,1:10]
listarticle <- metadataarticle$Sample_Name
listarticle
listmetabolism <- metadataarticle$Metabolism
listmetabolism
unique(listmetabolism)
colnames(metadataarticle)
```

Add metabolic stages for homoacetogenesis and Elong_H2CO2 (auto. vs. mixo.) in a new column "Main_Metabolic_Phase"

```{r}
metadataarticle$Main_Metabolic_Phase <- listmetabolism
metadataarticle$Main_Metabolic_Phase
colnames(metadataarticle)

old_name <- "Homoacetogenesis"
# Specify the new name
new_name <- "Homoacetogenesis_Mixo"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Homoacetogenesis"
# Specify the new name
new_name <- "Homoacetogenesis_Auto"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Elong_H2CO2"
# Specify the new name
new_name <- "Elong_H2CO2_Mixo"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Mixotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase

old_name <- "Elong_H2CO2"
# Specify the new name
new_name <- "Elong_H2CO2_Auto"

if (any(metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic")) {
  metadataarticle$Main_Metabolic_Phase[metadataarticle$Main_Metabolic_Phase == old_name & metadataarticle$Metabolism_Carbon == "Autotrophic"] <- new_name
}

metadataarticle$Main_Metabolic_Phase
unique(metadataarticle$Main_Metabolic_Phase)
unique(listmetabolism)
```

## 2.2. Metadata file cleaning

```{r}
colnames(metadataarticle)
metadataarticle2 <- metadataarticle[, !colnames(metadataarticle) %in% c("Sample","Reverse_primer","Barcode_i7","Index i7 Leuven","Experiment","Replicate","Name","Sample_ID")]
colnames(metadataarticle2)
metadataarticle2[1:10,1:10]
metadataarticle2$Fermentation_phase_simplified
metadataarticle2$Metabolism
unique(metadataarticle2$Fermentation_phase_simplified)
unique(metadataarticle2$Metabolism)
unique(metadataarticle2$Metabolism_Carbon)
unique(metadataarticle2$Main_Metabolic_Phase)
```

## 2.3. Metadata file subsetting

Without lag phases, and the inoculum/BSG samples

```{r}
metadataarticle3 <- metadataarticle2 %>% filter(grepl('Elong_H2CO2_Auto|Homoacetogenesis_Auto|Homoacetogenesis_Mixo|Elong_H2CO2_Mixo|Elong_EtOH&Lactate|Acidogenesis', Main_Metabolic_Phase))
rownames(metadataarticle3)

#With Methanogens
metadataarticle4 <- metadataarticle2 %>% filter(grepl('Elong_H2CO2_Auto|Homoacetogenesis_Auto|Homoacetogenesis_Mixo|Elong_H2CO2_Mixo|Elong_EtOH&Lactate|Acidogenesis|Methanogenesis', Main_Metabolic_Phase))
rownames(metadataarticle4)
```


# 3. <u>Read the abundance table (taxa)</u>

The abundance file contains the ASV numbers as rows and sample names as columns.
The abundance table contains 123 samples and 1866 ASVs.

```{r}
taxa <- read.table(file = "feature_table_final_rarefied.tsv", sep = "\t", header = TRUE, check.names=F)
taxa
head(taxa[, 125], 20)
colnames(taxa)
head(rownames(taxa), 20)
length(rownames(taxa))
dim(taxa)
```


## 4. <u>Read the taxonomy table (taxonomic classification)</u>

The taxonomy file contains the ASV identifiers as rows and taxonomic ranks as columns, from Kingdom to Genus. The ASV_table file was extracted from the rarefied abundance table.

```{r}
taxanames <- read.table(file = "ASV_table.csv", sep = ",", header = TRUE, check.names=F)
asv_table <- taxanames
asv_table
colnames(asv_table)
df <- separate(asv_table, taxonomy, into = c("Kingdom", "Phylum", "Class", "Order","Family","Genus"), sep = ";", convert = TRUE)
colnames(df)
df[1:50,1:7]
class(df)
fill_value <- "NA"
df$Family[df$Family == ""] <- fill_value
df$Kingdom[df$Kingdom == ""] <- fill_value
df$Phylum[df$Phylum == ""] <- fill_value
df$Class[df$Class == ""] <- fill_value
df$Order[df$Order == ""] <- fill_value
df$Genus[df$Genus == ""] <- fill_value
df[1:50,1:7]
df$Genus <- df$Genus %>% replace_na('NA')
df$Family <- df$Family %>% replace_na('NA')
df$Class <- df$Class %>% replace_na('NA')
df$Order <- df$Order %>% replace_na('NA')
df$Phylum <- df$Phylum %>% replace_na('NA')
df$Kingdom <- df$Kingdom %>% replace_na('NA')
df[1:50,1:7]
old_char <- "NA"
new_char <- "Unclassified"
df_replaced <- data.frame(lapply(df, function(x) gsub(old_char, new_char, x)))
df_replaced
df_replaced[1:50,1:7]
asv_table2 <- df_replaced
asv_table2
```

Create "ASV_#" and "ASV_#_Genus" identifiers

```{r}
listnumber <- asv_table$FEATURE_ID
head(listnumber,20)
#Pay attention: This line defines a function called modify_vector_elements that takes two arguments: vec (the input vector) and ASV (the Additional String Vector).
#ASV_Additional String Vector
modify_vector_elements <- function(vec, ASV) {
  modified_vec <- paste0(ASV, vec)
  return(modified_vec)
}
FEATURE_ID2 <- modify_vector_elements(listnumber, "ASV_")
head(FEATURE_ID2,20)

length(FEATURE_ID2)
length(listnumber)

row.names(asv_table2) <- FEATURE_ID2
asv_table2
asv_table2 <- asv_table2[,-1]
colnames(asv_table2)
asv_table2$ASV <- rownames(asv_table2)
colnames(asv_table2)
asv_table2$ASV_Genus <- paste(asv_table2$ASV, asv_table2$Genus, sep = "_")
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)
asv_table2$ASV_Genus <- gsub("g__", "", asv_table2$ASV_Genus)
colnames(asv_table2)
head(asv_table2$ASV_Genus,20)

listnumber2 <- asv_table2$ASV_Genus
head(listnumber2,20)
```


## 5. <u>Object summary and characteristics</u>

From the summary of abundance table, the value of 0.5359 corresponds to the mean, i.e., 1000 counts / 1866 detected ASVs.

```{r}
is.object(metadataarticle2)
is.object(taxa)
is.object(asv_table2)

class(metadataarticle2)
class(taxa)
class(asv_table2)

summary(metadataarticle2)
head(row.names(metadataarticle2),20)
head(colnames(metadataarticle2),20)
dim(taxa)
summary(taxa)
length(unique(taxa$FEATURE_ID))
max(taxa$FEATURE_ID)
min(taxa$FEATURE_ID)
head(row.names(taxa),20)
head(colnames(taxa),20)
summary(asv_table2)
head(row.names(asv_table2),20)
head(colnames(asv_table2),20)
```

Change column names in taxa to match row names of metadata (before selecting samples) and change ASV identifiers using ASV_#_Genus.

```{r}
taxa
colnames(taxa)
head(taxa$FEATURE_ID,25)
head(taxa$taxonomy,25)
taxa$ASV_Genus <- listnumber2
head(taxa$ASV_Genus,25)
head(row.names(taxa),20)
row.names(taxa) <- taxa$ASV_Genus
colnames(taxa)
taxa <- taxa[,-1]
colnames(taxa)
taxa <- taxa[,-124]
taxa <- taxa[,-124]
colnames(taxa)
taxa[1:10,1:10]

#Change column names of taxa based on row names of metadata. Distinctions between Hyphen or Dash, and Underscores! 
colnames(taxa)
dim(taxa)
rownames(metadata)
dim(metadata)

colnames(taxa) <- rownames(metadata)
colnames(taxa)

#the identical() function in R does check both the values and their order.
identical(colnames(taxa), rownames(metadata))
```


## 6. <u>Sample selection for heatmap construction</u>

To construct the heatmap, we select metadataarticle3 including the metabolic phases: Elong_H2CO2_Auto, Homoacetogenesis_Auto, Homoacetogenesis_Mixo, Elong_H2CO2_Mixo, Elong_EtOH&Lactate, and Acidogenesis.

If methanogenesis included, make reference to metadataarticle4.

```{r}
columns_to_keep <- rownames(metadataarticle3)
columns_to_keep

columns_to_keep2 <- rownames(metadataarticle4)
columns_to_keep2

# Subset the dataframe to include only the specified columns
taxa_subset <- taxa[, columns_to_keep]
taxa_subset
dim(taxa_subset)

taxa_subset2 <- taxa[, columns_to_keep2]
taxa_subset2
dim(taxa_subset2)
taxa_subset2[1:20,1:20]

head(rowSums(taxa),20)
head(rowSums(taxa_subset),20)
head(rowSums(taxa_subset2),20)

length(rowSums(taxa))
length(rowSums(taxa_subset))
length(rowSums(taxa_subset2))
```

In taxa file, select ASVs with a relative abundance of 0 in all samples.

```{r}
asvs_zero_sum <- rownames(taxa)[rowSums(taxa) == 0]
length(asvs_zero_sum)

asvs_zero_sum2 <- rownames(taxa_subset)[rowSums(taxa_subset) == 0]
length(asvs_zero_sum2)
asvs_zero_sum2

asvs_zero_sum3 <- rownames(taxa_subset2)[rowSums(taxa_subset2) == 0]
length(asvs_zero_sum3)
```

In taxa file, select ASVs with a relative abundance greather than 0 in all samples.
635 ASVs detected in Elong_H2CO2_Auto, Homoacetogenesis_Auto, Homoacetogenesis_Mixo, Elong_H2CO2_Mixo, Elong_EtOH&Lactate, and Acidogenesis.

```{r}
asvs_zero_sum4 <- rownames(taxa_subset)[rowSums(taxa_subset) != 0]
length(asvs_zero_sum4)
asvs_zero_sum4
```

Select ASVs greater than zero in taxa_subset

```{r}
taxa_subset
asvs_zero_sum4
class(taxa_subset)
class(asvs_zero_sum4)

selected_rows <- taxa_subset[rownames(taxa_subset) %in% asvs_zero_sum4, ]
selected_rows
dim(selected_rows)

summary(selected_rows)

asv_counts_per_sample <- apply(selected_rows, 2, function(x) sum(x > 0))
asv_counts_per_sample
```

If you want to determine how many times an ASV is detected in selected samples, you can use the R expression function(x) sum(x > 0). This defines an anonymous function that counts the number of elements in x that are greater than 0.

```{r}
asv_detection_counts <- apply(selected_rows, 1, function(x) sum(x > 0))
asv_detection_counts
asv_detection_counts_sorted <- sort(asv_detection_counts, decreasing = TRUE)
asv_detection_counts_sorted
```

## 7. <u>Annotations and color palettes for heatmap construction</u>

```{r}
#Define annotations
df <- metadataarticle3[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df)
df
class(df)

df2 <- metadataarticle4[,c("Main_Metabolic_Phase","Metabolism_Carbon","Reactor")]
colnames(df2)

#Color palette
mypal5 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100)
mypal5

length(mypal5)

# Visualize the palette as a horizontal strip
image(1:100, 1, as.matrix(1:100), col = mypal5, axes = FALSE, xlab = "", ylab = "")

annotation_colors <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571"  )
)

annotation_colors2 <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571",
    Methanogenesis = "blue")
)

annotation_colors3 <- list(
  Reactor = c(R1 = "#b1b1b1", R2 = "#7F1734"),
  Metabolism_Carbon = c(Autotrophic = "#1e7dff", Mixotrophic = "#ff7b00"),
  Main_Metabolic_Phase = c(
    Acidogenesis = "#30d5c8", 
    `Elong_EtOH&Lactate` = "#ffd700", 
    `Elong_H2CO2_Auto` = "#ff6cda", 
    `Elong_H2CO2_Mixo` = "#9a009a", 
    Homoacetogenesis_Auto = "#008100", 
    Homoacetogenesis_Mixo = "#93c571"  ),
  Phylum = c(
  "p__Actinobacteriota"             = "#ff0000",
  "p__Bacteroidota"                 = "#000000",
  "p__Firmicutes"                   = "#ffff00")
)

```

To add row annotations in the heatmap.

```{r}
asv_annotation <- asv_table2[, c("ASV_Genus", "Phylum")]
rownames(asv_annotation) <- asv_annotation$ASV_Genus
head(asv_annotation)
asv_annotation <- asv_annotation[, "Phylum", drop = FALSE]
head(asv_annotation)
unique(asv_annotation$Phylum)

identical(row.names(asv_annotation),row.names(taxa_subset))

class(asv_annotation)

taxatop25f
list <- row.names(taxatop25f)
list
asv_annotation
rownames(asv_annotation)
dim(asv_annotation)

asv_annotation_subset <- asv_annotation[rownames(asv_annotation) %in% rownames(taxatop25f), , drop = FALSE]
asv_annotation_subset
class(asv_annotation_subset)
```

## <u>8. Select the TOP25/30 taxa in all selected samples</u>

```{r}
taxa_subset
taxa_subsett <- t(taxa_subset)
taxa_subsett_decr <- taxa_subsett[,order(colSums(taxa_subsett),decreasing=TRUE)]
rownames(taxa_subsett_decr)
colnames(taxa_subsett_decr)
top25list <- colnames(taxa_subsett_decr)[1:25]
top25list
taxatop25 <- data.frame(taxa_subsett_decr[,colnames(taxa_subsett_decr) %in% top25list], check.names = FALSE)
taxatop25
taxatop25f <- t(taxatop25)
taxatop25f
colnames(taxatop25f)
colSums(taxatop25f)
rowSums(taxatop25f)
write.csv(taxatop25f, "taxatop25f.csv", row.names = TRUE)

w <- taxatop25f+1
write.csv(w, "taxatop25fwithone.csv", row.names = TRUE)

taxa_subset2
taxa_subset2t <- t(taxa_subset2)
taxa_subset2t_decr <- taxa_subset2t[,order(colSums(taxa_subset2t),decreasing=TRUE)]
rownames(taxa_subset2t_decr)
colnames(taxa_subset2t_decr)
top30listm <- colnames(taxa_subset2t_decr)[1:30]
top30listm
taxatop30m <- data.frame(taxa_subset2t_decr[,colnames(taxa_subset2t_decr) %in% top30listm], check.names = FALSE)
taxatop30m
taxatop30mf <- t(taxatop30m)
taxatop30mf
colSums(taxatop30mf)
```


# <u>9. Data transformation and heatmap construction</u>

Add 1 to your rarefied counts before log transformation:
log(x + 1) is a standard preprocessing step.

```{r}
taxatop25nozero <- taxatop25f+1
write.csv(taxatop25nozero, "taxatop25fnozero_2.csv", row.names = TRUE)

range(taxatop25nozero, na.rm = TRUE)
```

Euclidean distances are used by default in pheatmap, for both rows and columns (clustering_distance_rows = "euclidean"; clustering_distance_cols = "euclidean")

## 9.1. Log2 transformation

LOG2(1) = 0
LOG2(0) undefined

```{r}
datlog2 <- log2(taxatop25f+1)

range(datlog2, na.rm = TRUE)
summary(as.vector(datlog2))

str(datlog2)
is.matrix(datlog2)
is.numeric(datlog2)
```

This command uses the pheatmap package in R to generate a heatmap and stores the result (including dendrogram information) in the object plog2_average. pheatmap() uses here average linkage (UPGMA) for hierarchical clustering of rows and columns. cutree_cols = 7 cuts the column dendrogram into 7 clusters. This means: Columns (samples) will be grouped into 7 clusters, and these clusters are shown with colored bars based on defined annotations. cutree_rows = 10 cuts the row dendrogram into 10 clusters. This groups the taxa (rows) into 10 clusters, and helps visually identify groups of similar features.

```{r}
tiff("plog2_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

```{r}
tiff("plog2_average_2_rows.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(datlog2, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_row=asv_annotation_subset, annotation_colors = annotation_colors3, cutree_cols = 7, cutree_rows = 10)
dev.off()
```


Define scale of color palette.

```{r}
my_breaks <- seq(0, 10, by = 0.5)
mypal6 <- colorRampPalette(RColorBrewer::brewer.pal(9, "YlGnBu"))(length(my_breaks) - 1)
mypal6 <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(20)
mypal6

length(mypal5)
length(mypal6)

str(my_breaks)
is.numeric(my_breaks)             
all(is.finite(my_breaks))          
length(mypal5) == length(my_breaks) - 1
print(length(mypal5))

tiff("plog2_average_2_scalecolor.tiff", width = 16, height = 8, units = "in", res = 300)
plog2_average <- pheatmap(mat = datlog2, color = mypal6, clustering_method="average", breaks = my_breaks, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

## 9.2. Log10 transformation

```{r}
datlog10 <- log10(taxatop25f+1)

range(datlog10, na.rm = TRUE)
summary(as.vector(datlog10))

str(datlog10)
is.matrix(datlog10)
is.numeric(datlog10)
```

```{r}
tiff("plog10_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
plog10_average <- pheatmap(datlog10, mypal5, clustering_method="average", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

## 9.3. clr transformation

clr (Centered Log-Ratio) transformation from the package compositions. clr uses the geometric mean of all components (in microbiome data, each component in a compositional vector corresponds to a specific ASV or OTU).

```{r}
taxatop25f
row.names(taxatop25f)
colnames(taxatop25f)

taxatop25f2 <- taxatop25f+1
write.csv(taxatop25f2, "taxatop25fplusone.csv", row.names = TRUE)

datclr <- clr(acomp(taxatop25f+1))

range(datclr, na.rm = TRUE)
summary(as.vector(datclr))

class(datclr)
is.matrix(datclr)
is.numeric(datclr)

datclr_mat <- as.matrix(datclr)
datclr_mat
write.csv(datclr_mat, "datclr_mat.csv", row.names = TRUE)
```

```{r}
tiff("pclr_average_2.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```

```{r}
tiff("pclr_average_2_mat.tiff", width = 16, height = 8, units = "in", res = 300)
pclr_average <- pheatmap(datclr_mat, mypal5, clustering_method="complete", breaks = NA, scale = "none", fontsize=10, fontsize_row=8, fontsize_col=8, annotation_col=df, annotation_colors = annotation_colors, cutree_cols = 7, cutree_rows = 10)
dev.off()
```
