# ==== EDIT THESE THREE LINES ====
filename <- "ClassProject_votus_6samples_coverm.tsv"  # Excel file name
tpm_threshold <- 10                                  # keep vOTUs with max TPM > this
heatmap_colors <- c("#440154", "#31688e", "#35b779", "#fde725")

# =================================
library(readr)
library(readxl)
library(pheatmap)

# 1. Read the Excel file
cov <- read_tsv(filename)
## Rows: 72 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): Contig
## dbl (18): sample1_jbs172_sorted Mean, sample1_jbs172_sorted RPKM, sample1_jb...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 2. Keep Contig and TPM columns only
tpm_cols <- grepl("TPM$", names(cov))
cov_tpm <- cov[ , c("Contig", names(cov)[tpm_cols])]

# Remove S1k141_26921_full
cov_tpm <- subset(cov_tpm, Contig != "S1k141_26921||full")

# 3. Optional filter: drop very low-abundance vOTUs
cov_tpm$max_tpm <- apply(cov_tpm[ , -1], 1, max, na.rm = TRUE)
cov_tpm <- subset(cov_tpm, max_tpm > tpm_threshold)
cov_tpm$max_tpm <- NULL

# If everything got filtered out (threshold too high), warn and stop
if (nrow(cov_tpm) == 0) {
  stop("No vOTUs passed the TPM threshold. Try lowering tpm_threshold.")
}

# 4. Make matrix for heatmap (rows = vOTUs, cols = samples)
mat <- as.matrix(cov_tpm[ , -1])
rownames(mat) <- cov_tpm$Contig

# 5. Log-transform for nicer color scaling
mat_log <- log10(mat + 1)

# 6. Draw heat map
pheatmap(mat_log,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         scale = "none",
         color = colorRampPalette(heatmap_colors)(100),
         fontsize_row = 8,
         fontsize_col = 10,
         main = "vOTU relative abundance (log10 TPM + 1)")

## Each session:
library(vegan)
## Loading required package: permute
library(pheatmap)
library(RColorBrewer)
library(readr)

## 1. Read your exact file
abund_raw <- read_tsv(
  "ClassProject_votus_6samples_coverm.tsv"
)
## Rows: 72 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): Contig
## dbl (18): sample1_jbs172_sorted Mean, sample1_jbs172_sorted RPKM, sample1_jb...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## 2. First column is vOTU / Contig ID. Make that the rownames.
rownames(abund_raw) <- abund_raw$Contig
## Warning: Setting row names on a tibble is deprecated.
## 3. Drop the Contig column so we have a numeric matrix only
abund <- abund_raw[, -1]

## 4. Quick sanity check
dim(abund)
## [1] 72 18
head(rownames(abund))   # vOTU IDs
## [1] "1" "2" "3" "4" "5" "6"
colnames(abund)         # sample1_jbs172_sorted TPM, etc.
##  [1] "sample1_jbs172_sorted Mean" "sample1_jbs172_sorted RPKM"
##  [3] "sample1_jbs172_sorted TPM"  "sample2_mjd356_sorted Mean"
##  [5] "sample2_mjd356_sorted RPKM" "sample2_mjd356_sorted TPM" 
##  [7] "sample3_mrk143_sorted Mean" "sample3_mrk143_sorted RPKM"
##  [9] "sample3_mrk143_sorted TPM"  "sample6_mmw162_sorted Mean"
## [11] "sample6_mmw162_sorted RPKM" "sample6_mmw162_sorted TPM" 
## [13] "sample7_pj288_sorted Mean"  "sample7_pj288_sorted RPKM" 
## [15] "sample7_pj288_sorted TPM"   "SRR37587572_sorted Mean"   
## [17] "SRR37587572_sorted RPKM"    "SRR37587572_sorted TPM"
##per sample richness and diversity
## Transpose: now rows = samples, columns = vOTUs
abund_t <- t(abund)

## Optional: drop samples with total abundance = 0
abund_t <- abund_t[rowSums(abund_t) > 0, , drop = FALSE]

## 1. Species richness per sample (number of vOTUs with >0)
richness <- specnumber(abund_t)   # named vector, names are sample IDs

## 2. Shannon diversity per sample
shannon <- diversity(abund_t, index = "shannon")
abund <- abund_raw[, -1]

# Force numeric
abund <- as.data.frame(lapply(abund, as.numeric))
sapply(abund, is.numeric)
## sample1_jbs172_sorted.Mean sample1_jbs172_sorted.RPKM 
##                       TRUE                       TRUE 
##  sample1_jbs172_sorted.TPM sample2_mjd356_sorted.Mean 
##                       TRUE                       TRUE 
## sample2_mjd356_sorted.RPKM  sample2_mjd356_sorted.TPM 
##                       TRUE                       TRUE 
## sample3_mrk143_sorted.Mean sample3_mrk143_sorted.RPKM 
##                       TRUE                       TRUE 
##  sample3_mrk143_sorted.TPM sample6_mmw162_sorted.Mean 
##                       TRUE                       TRUE 
## sample6_mmw162_sorted.RPKM  sample6_mmw162_sorted.TPM 
##                       TRUE                       TRUE 
##  sample7_pj288_sorted.Mean  sample7_pj288_sorted.RPKM 
##                       TRUE                       TRUE 
##   sample7_pj288_sorted.TPM    SRR37587572_sorted.Mean 
##                       TRUE                       TRUE 
##    SRR37587572_sorted.RPKM     SRR37587572_sorted.TPM 
##                       TRUE                       TRUE
## 3. Combine into a data.frame
alpha_div <- data.frame(
  Sample   = rownames(abund_t),
  Richness = richness,
  Shannon  = shannon
)

alpha_div
##                                                Sample Richness    Shannon
## sample1_jbs172_sorted Mean sample1_jbs172_sorted Mean       17 0.07833997
## sample1_jbs172_sorted RPKM sample1_jbs172_sorted RPKM       17 0.08150328
## sample1_jbs172_sorted TPM   sample1_jbs172_sorted TPM       17 0.08150325
## sample2_mjd356_sorted Mean sample2_mjd356_sorted Mean       15 0.05610716
## sample2_mjd356_sorted RPKM sample2_mjd356_sorted RPKM       15 0.05830023
## sample2_mjd356_sorted TPM   sample2_mjd356_sorted TPM       15 0.05830022
## sample3_mrk143_sorted Mean sample3_mrk143_sorted Mean       47 0.39097504
## sample3_mrk143_sorted RPKM sample3_mrk143_sorted RPKM       48 0.41819529
## sample3_mrk143_sorted TPM   sample3_mrk143_sorted TPM       48 0.41819532
## sample6_mmw162_sorted Mean sample6_mmw162_sorted Mean       27 1.67629073
## sample6_mmw162_sorted RPKM sample6_mmw162_sorted RPKM       27 1.77657424
## sample6_mmw162_sorted TPM   sample6_mmw162_sorted TPM       27 1.77657421
## sample7_pj288_sorted Mean   sample7_pj288_sorted Mean       31 2.14928297
## sample7_pj288_sorted RPKM   sample7_pj288_sorted RPKM       31 2.20659071
## sample7_pj288_sorted TPM     sample7_pj288_sorted TPM       31 2.20659070
## Barplot of richness PER SAMPLE
barplot(
  alpha_div$Richness,
  names.arg = alpha_div$Sample,   # force sample IDs on x-axis
  las = 2,
  ylab = "vOTU richness (count)",
  main = "Per-sample viral richness"
)

## Barplot of Shannon PER SAMPLE
barplot(
  alpha_div$Shannon,
  names.arg = alpha_div$Sample,
  las = 2,
  ylab = "Shannon diversity",
  main = "Per-sample viral diversity"
)