Load necessary packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following object is masked from 'package:utils':
## 
##     findMatches
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(EnhancedVolcano)
## Loading required package: ggrepel
setwd("/stor/work/FRI_321G_JD_Spring2026/Spring_Projects/group_Ambystoma/")

readcounts <- read.csv("READCOUNTS/Tur_doh.readcounts.csv", row.names = 1)
protein_key <- read.csv("READCOUNTS/Tur_doh.proteinkey.csv", row.names = 1)
protein_key$protein_id <- rownames(protein_key)
metadata <- read.csv("PRJNA734867/metadata.csv")

head(readcounts)
##                                                      SRR16148333 SRR16148332
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1          93         228
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1          40          43
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      180051      111525
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1          87          96
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         268         204
##                                                      SRR16148319 SRR16148331
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        1556       16837
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         223        2344
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      122741      199160
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           1
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         130         393
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         243         887
##                                                      SRR16148318 SRR16148337
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        1608         148
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         337          30
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      195020      133350
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           2           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         218          92
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         382         155
##                                                      SRR16148341 SRR16148336
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        1028        1697
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         273         159
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      187224       71085
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         304          45
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1        1184          98
##                                                      SRR16148340 SRR16148335
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        4043        1216
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         497         309
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      181245      166210
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         399         179
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         517         215
##                                                      SRR16148343 SRR16148334
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        4404         158
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         860          23
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      190028      170543
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1         128           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         733          76
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         663         244
##                                                      SRR16148342 SRR16148312
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        5181         723
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         663         165
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      186605      386118
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           1
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         179         120
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         294         297
##                                                      SRR16148338 SRR16148310
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        3917        4687
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         305         391
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      104176      273121
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         153         179
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         388         424
##                                                      SRR16148339 SRR16148311
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        2193         232
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         800          74
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      229214      196845
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1         563           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         202         143
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         344         477
##                                                      SRR16148307 SRR16148306
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1         288       22121
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         222        1730
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      304926      150723
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         429         400
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         780         958
##                                                      SRR16148304 SRR16148303
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        5141        2930
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         476         307
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      454420      338876
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         261         184
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         230         202
##                                                      SRR16148302 SRR16148301
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1       10755        1786
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1        1241         265
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      466866      399573
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         465         216
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         582         201
##                                                      SRR16148329 SRR16148300
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1         316        8518
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         123         853
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      150581      355441
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           1           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         129         341
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         328         394
##                                                      SRR16148328 SRR16148326
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1         177        1500
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1          47         383
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      120540      186114
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           1           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1          82         326
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         204         402
##                                                      SRR16148327 SRR16148324
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1         779        1195
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         331         421
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      177913      181055
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         269         267
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         321         413
##                                                      SRR16148325 SRR16148322
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        2459         823
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         670         155
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      278491      221905
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1         168          12
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         215         225
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         528         174
##                                                      SRR16148323 SRR16148320
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        1820        5393
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         435         797
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1      211635      303372
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1         379         190
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         309         571
##                                                      SRR16148308 SRR16148309
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1        2669       11084
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1         222        1503
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1       22020      473989
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1           0           0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1          24         224
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1         102         816
head(protein_key)
##                                                                                                                                                                          Protein_Name
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1                             "Similar_to_RNR2B_Ribonucleoside-diphosphate_reductase_small_chain_B_(Arabidopsis_thaliana_OX=3702)"
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1                                     "Similar_to_Rrm1_Ribonucleoside-diphosphate_reductase_large_subunit_(Mus_musculus_OX=10090)"
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1                                                      "Similar_to_HSPA8_Heat_shock_cognate_71_kDa_protein_(Pongo_abelii_OX=9601)"
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1                                  "Similar_to_IIV6-467R_Uncharacterized_protein_467R_(Invertebrate_iridescent_virus_6_OX=176652)"
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1                                                                "Similar_to_Xrn1_5'-3'_exoribonuclease_1_(Mus_musculus_OX=10090)"
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 "Similar_to_RPO21_DNA-directed_RNA_polymerase_II_subunit_RPB1_(Saccharomyces_cerevisiae_(strain_ATCC_204508_/_S288c)_OX=559292)"
##                                                                                                protein_id
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 augustus_masked-scaffold1-processed-gene-0.13-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 augustus_masked-scaffold1-processed-gene-0.14-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 augustus_masked-scaffold1-processed-gene-0.19-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 augustus_masked-scaffold1-processed-gene-0.28-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 augustus_masked-scaffold1-processed-gene-0.34-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 augustus_masked-scaffold1-processed-gene-0.49-mRNA-1

Gene lengths for proteins:

library(Biostrings)
## Loading required package: XVector
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
# read fasta
seqs <- readAAStringSet("Reference/dornii.putative_func.proteins.fasta")

# get lengths
lengths <- width(seqs)

# get names (headers)
names <- names(seqs)

# make a dataframe
fasta_df <- data.frame(
  protein_id = names,
  length = lengths,
  stringsAsFactors = FALSE
)
fasta_df$protein_id <- sub(" .*", "", fasta_df$protein_id)
protein_key <- merge(protein_key, fasta_df, by = "protein_id", all.x = TRUE)

PCA Analysis

DE Analysis

colnames(readcounts)
##  [1] "SRR16148333" "SRR16148332" "SRR16148319" "SRR16148331" "SRR16148318"
##  [6] "SRR16148337" "SRR16148341" "SRR16148336" "SRR16148340" "SRR16148335"
## [11] "SRR16148343" "SRR16148334" "SRR16148342" "SRR16148312" "SRR16148338"
## [16] "SRR16148310" "SRR16148339" "SRR16148311" "SRR16148307" "SRR16148306"
## [21] "SRR16148304" "SRR16148303" "SRR16148302" "SRR16148301" "SRR16148329"
## [26] "SRR16148300" "SRR16148328" "SRR16148326" "SRR16148327" "SRR16148324"
## [31] "SRR16148325" "SRR16148322" "SRR16148323" "SRR16148320" "SRR16148308"
## [36] "SRR16148309"
metadata$RunID
##  [1] "SRR16148300" "SRR16148301" "SRR16148302" "SRR16148303" "SRR16148304"
##  [6] "SRR16148305" "SRR16148306" "SRR16148307" "SRR16148308" "SRR16148309"
## [11] "SRR16148310" "SRR16148311" "SRR16148312" "SRR16148313" "SRR16148314"
## [16] "SRR16148315" "SRR16148316" "SRR16148317" "SRR16148318" "SRR16148319"
## [21] "SRR16148320" "SRR16148321" "SRR16148322" "SRR16148323" "SRR16148324"
## [26] "SRR16148325" "SRR16148326" "SRR16148327" "SRR16148328" "SRR16148329"
## [31] "SRR16148330" "SRR16148331" "SRR16148332" "SRR16148333" "SRR16148334"
## [36] "SRR16148335" "SRR16148336" "SRR16148337" "SRR16148338" "SRR16148339"
## [41] "SRR16148340" "SRR16148341" "SRR16148342" "SRR16148343"
# reorder metadata to match count matrix
metadata <- metadata[match(colnames(readcounts), metadata$RunID), ]

# check
all(colnames(readcounts) == metadata$RunID)
## [1] TRUE
subset_idx <- metadata$Stage %in% c("Cyst", "NoReversion")

counts_subset <- readcounts[, subset_idx]
metadata_subset <- metadata[subset_idx, ]
metadata_subset$Stage <- factor(metadata_subset$Stage)
metadata_subset$Stage <- relevel(metadata_subset$Stage, ref = "NoReversion")
library(DESeq2)

dds <- DESeqDataSetFromMatrix(
  countData = counts_subset,
  colData = metadata_subset,
  design = ~ Stage
)
dds <- dds[rowSums(counts(dds)) > 10, ]
vsd <- vst(dds, blind = TRUE)

plotPCA(vsd, intgroup = "Stage")
## using ntop=500 top features by variance
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DESeq2 package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept"                 "Stage_Cyst_vs_NoReversion"
res <- results(dds, contrast = c("Stage", "Cyst", "NoReversion"))
head(res)
## log2 fold change (MLE): Stage Cyst vs NoReversion 
## Wald test p-value: Stage Cyst vs NoReversion 
## DataFrame with 6 rows and 6 columns
##                                                        baseMean log2FoldChange
##                                                       <numeric>      <numeric>
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1   3472.319     -2.5083145
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1    322.211     -1.8176323
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 159604.923      0.3271777
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1    125.629     -0.2876313
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1    310.521     -0.5733679
## augustus_masked-scaffold1-processed-gene-0.9-mRNA-1     336.906     -0.0988602
##                                                          lfcSE      stat
##                                                      <numeric> <numeric>
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1  1.282313 -1.956086
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1  0.821358 -2.212961
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1  0.329998  0.991453
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1  0.379722 -0.757478
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1  0.387414 -1.479986
## augustus_masked-scaffold1-processed-gene-0.9-mRNA-1   0.318229 -0.310657
##                                                         pvalue      padj
##                                                      <numeric> <numeric>
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 0.0504551  0.175170
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 0.0269004  0.113770
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 0.3214644  0.566511
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 0.4487635  0.682256
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 0.1388769  0.338906
## augustus_masked-scaffold1-processed-gene-0.9-mRNA-1  0.7560613  0.885973
res <- res[order(res$padj), ]

# remove NA rows first
res_clean <- res[!is.na(res$padj), ]

# then filter significant
sig <- res_clean[res_clean$padj < 0.05, ]

head(sig)
## log2 fold change (MLE): Stage Cyst vs NoReversion 
## Wald test p-value: Stage Cyst vs NoReversion 
## DataFrame with 6 rows and 6 columns
##                                                         baseMean log2FoldChange
##                                                        <numeric>      <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1             69082.72        8.48994
## maker-scaffold8088-augustus-gene-0.2-mRNA-1             97746.43        9.81823
## maker-scaffold1293-augustus-gene-0.2-mRNA-1              2826.72        4.31533
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1   4450.31       -7.32616
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1   13443.05       -6.03907
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1   2356.72       -5.90921
##                                                            lfcSE      stat
##                                                        <numeric> <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1             0.382587   22.1909
## maker-scaffold8088-augustus-gene-0.2-mRNA-1             0.540952   18.1499
## maker-scaffold1293-augustus-gene-0.2-mRNA-1             0.256636   16.8150
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1  0.489543  -14.9653
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1   0.408097  -14.7981
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1  0.415576  -14.2193
##                                                              pvalue
##                                                           <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1            4.20722e-109
## maker-scaffold8088-augustus-gene-0.2-mRNA-1             1.28583e-73
## maker-scaffold1293-augustus-gene-0.2-mRNA-1             1.89457e-63
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1  1.23763e-50
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1   1.50619e-49
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1  6.95073e-46
##                                                                padj
##                                                           <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1            7.29364e-105
## maker-scaffold8088-augustus-gene-0.2-mRNA-1             1.11456e-69
## maker-scaffold1293-augustus-gene-0.2-mRNA-1             1.09481e-59
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1  5.36390e-47
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1   5.22228e-46
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1  2.00830e-42
DE_results <- as.data.frame(res_clean)

library(EnhancedVolcano)
EnhancedVolcano(DE_results,
    lab = rownames(DE_results),
    x = 'log2FoldChange',
    y = 'padj',
    title = 'Differential Gene Expression: Cyst vs NoReversion',
    subtitle = 'Positive log2FC = upregulated in Cyst; Negative log2FC = upregulated in NoReversion',
    xlab = 'Log2 Fold Change (Cyst / NoReversion)',
    ylab = '-Log10 Adjusted P-value',
    pCutoff = 0.05,
    FCcutoff = 1,
    pointSize = 2.0,
    labSize = 3.0
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the EnhancedVolcano package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the EnhancedVolcano package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

INTERPRETING RESULTS BY LOOKING AT PROTEINS

top 10 differentially expressed for Cyst and NoRev stages

library(stringr)
library(dplyr)

res_clean <- as.data.frame(res_clean)
res_clean$gene_id <- sub("-mRNA-.*$", "", rownames(res_clean))
res_clean$gene_id <- sub("\\.[0-9]+$", "", res_clean$gene_id)
res_cyst <- res_clean %>%
  filter(log2FoldChange > 0, padj < 0.05) %>%
  arrange(desc(log2FoldChange)) %>%
  distinct(gene_id, .keep_all = TRUE)

top10_cyst <- head(res_cyst[order(-res_cyst$log2FoldChange), ], 10)

top10_cyst <- as.data.frame(top10_cyst)

res_norev <- res_clean %>%
  filter(log2FoldChange < 0, padj < 0.05) %>%
  arrange(log2FoldChange) %>%  # most negative first
  distinct(gene_id, .keep_all = TRUE)

top10_norev <- head(res_norev[order(res_norev$log2FoldChange), ], 10)

top10_norev = as.data.frame(top10_norev)
top_genes <- c(rownames(top10_cyst), rownames(top10_norev))
top10_cyst_df <- data.frame(
  protein_id = rownames(top10_cyst),
  group = "cyst"
)

top10_norev_df <- data.frame(
  protein_id = rownames(top10_norev),
  group = "norev"
)

top_genes_df <- rbind(top10_cyst_df, top10_norev_df)

protein_key_top <- protein_key %>%
  inner_join(top_genes_df, by = "protein_id")

protein_key_top_unknown <- protein_key_top %>% filter(Protein_Name == "\"Protein_of_unknown_function\"")

also acquire the top 50 for each so we can filter out uncharacterized proteins

top50_cyst <- head(res_cyst[order(-res_cyst$log2FoldChange), ], 50)
top50_cyst <- as.data.frame(top50_cyst)

top50_norev <- head(res_norev[order(res_norev$log2FoldChange), ], 50)
top50_norev <- as.data.frame(top50_norev)

make_top_df <- function(df, group_name) {
  data.frame(
    protein_id = rownames(df),
    group = group_name
  )
}

top50_genes_df <- rbind(
  make_top_df(top50_cyst, "cyst"),
  make_top_df(top50_norev, "norev")
)

protein_key_top50 <- protein_key %>%
  inner_join(top50_genes_df, by = "protein_id")

protein_key_top50_unknown <- protein_key_top50 %>%
  filter(Protein_Name == "\"Protein_of_unknown_function\"")
length(intersect(rownames(top10_cyst), rownames(top50_cyst)))   # should be 10
## [1] 10
length(intersect(rownames(top10_norev), rownames(top50_norev))) # should be 10
## [1] 10

create fasta for blast

clean_ids <- sub(" .*", "", names(seqs))

names(seqs) <- clean_ids

subset_seqs <- seqs[names(seqs) %in% protein_key_top50_unknown$protein_id]

length(subset_seqs)
## [1] 46
setdiff(protein_key_top50_unknown$protein_id, names(seqs))
## character(0)
writeXStringSet(subset_seqs, "unknown_proteins_for_blast.fasta")

HEATMAP heatmap for fun

top_genes <- c(rownames(top10_cyst), rownames(top10_norev))

vsd <- vst(dds, blind = FALSE)
mat <- assay(vsd)

mat_subset <- mat[top_genes, ]

annotation_col <- as.data.frame(colData(dds)[, "Stage", drop = FALSE])
rownames(annotation_col) <- colnames(mat_subset)


library(pheatmap)

pheatmap(mat_subset,
         scale = "row",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         annotation_col = annotation_col)

ann_colors <- list(
  Stage = c(
    Cyst = "red",
    NoReversion = "blue"
  )
)

pheatmap(mat_subset,
         scale = "row",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         annotation_col = annotation_col,
         annotation_colors = ann_colors)

sig_genes <- rownames(sig)

mat_sig_subset <- mat[sig_genes, ]

pheatmap(mat_sig_subset,
         scale = "row",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation")