Load necessary packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
##
## Attaching package: 'S4Vectors'
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following object is masked from 'package:utils':
##
## findMatches
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
##
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
##
## The following object is masked from 'package:dplyr':
##
## count
##
##
## Attaching package: 'MatrixGenerics'
##
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
##
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'Biobase'
##
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
##
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(EnhancedVolcano)
## Loading required package: ggrepel
setwd("/stor/work/FRI_321G_JD_Spring2026/Spring_Projects/group_Ambystoma/")
readcounts <- read.csv("READCOUNTS/Tur_doh.readcounts.csv", row.names = 1)
protein_key <- read.csv("READCOUNTS/Tur_doh.proteinkey.csv", row.names = 1)
protein_key$protein_id <- rownames(protein_key)
metadata <- read.csv("PRJNA734867/metadata.csv")
head(readcounts)
## SRR16148333 SRR16148332
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 93 228
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 40 43
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 180051 111525
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 87 96
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 268 204
## SRR16148319 SRR16148331
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 1556 16837
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 223 2344
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 122741 199160
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 1
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 130 393
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 243 887
## SRR16148318 SRR16148337
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 1608 148
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 337 30
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 195020 133350
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 2 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 218 92
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 382 155
## SRR16148341 SRR16148336
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 1028 1697
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 273 159
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 187224 71085
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 304 45
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 1184 98
## SRR16148340 SRR16148335
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 4043 1216
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 497 309
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 181245 166210
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 399 179
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 517 215
## SRR16148343 SRR16148334
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 4404 158
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 860 23
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 190028 170543
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 128 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 733 76
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 663 244
## SRR16148342 SRR16148312
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 5181 723
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 663 165
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 186605 386118
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 1
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 179 120
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 294 297
## SRR16148338 SRR16148310
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 3917 4687
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 305 391
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 104176 273121
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 153 179
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 388 424
## SRR16148339 SRR16148311
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 2193 232
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 800 74
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 229214 196845
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 563 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 202 143
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 344 477
## SRR16148307 SRR16148306
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 288 22121
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 222 1730
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 304926 150723
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 429 400
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 780 958
## SRR16148304 SRR16148303
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 5141 2930
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 476 307
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 454420 338876
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 261 184
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 230 202
## SRR16148302 SRR16148301
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 10755 1786
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 1241 265
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 466866 399573
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 465 216
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 582 201
## SRR16148329 SRR16148300
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 316 8518
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 123 853
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 150581 355441
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 1 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 129 341
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 328 394
## SRR16148328 SRR16148326
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 177 1500
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 47 383
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 120540 186114
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 1 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 82 326
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 204 402
## SRR16148327 SRR16148324
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 779 1195
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 331 421
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 177913 181055
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 269 267
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 321 413
## SRR16148325 SRR16148322
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 2459 823
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 670 155
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 278491 221905
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 168 12
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 215 225
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 528 174
## SRR16148323 SRR16148320
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 1820 5393
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 435 797
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 211635 303372
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 379 190
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 309 571
## SRR16148308 SRR16148309
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 2669 11084
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 222 1503
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 22020 473989
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 0 0
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 24 224
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 102 816
head(protein_key)
## Protein_Name
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 "Similar_to_RNR2B_Ribonucleoside-diphosphate_reductase_small_chain_B_(Arabidopsis_thaliana_OX=3702)"
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 "Similar_to_Rrm1_Ribonucleoside-diphosphate_reductase_large_subunit_(Mus_musculus_OX=10090)"
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 "Similar_to_HSPA8_Heat_shock_cognate_71_kDa_protein_(Pongo_abelii_OX=9601)"
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 "Similar_to_IIV6-467R_Uncharacterized_protein_467R_(Invertebrate_iridescent_virus_6_OX=176652)"
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 "Similar_to_Xrn1_5'-3'_exoribonuclease_1_(Mus_musculus_OX=10090)"
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 "Similar_to_RPO21_DNA-directed_RNA_polymerase_II_subunit_RPB1_(Saccharomyces_cerevisiae_(strain_ATCC_204508_/_S288c)_OX=559292)"
## protein_id
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 augustus_masked-scaffold1-processed-gene-0.13-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 augustus_masked-scaffold1-processed-gene-0.14-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 augustus_masked-scaffold1-processed-gene-0.19-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.28-mRNA-1 augustus_masked-scaffold1-processed-gene-0.28-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 augustus_masked-scaffold1-processed-gene-0.34-mRNA-1
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 augustus_masked-scaffold1-processed-gene-0.49-mRNA-1
Gene lengths for proteins:
library(Biostrings)
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
##
## compact
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
# read fasta
seqs <- readAAStringSet("Reference/dornii.putative_func.proteins.fasta")
# get lengths
lengths <- width(seqs)
# get names (headers)
names <- names(seqs)
# make a dataframe
fasta_df <- data.frame(
protein_id = names,
length = lengths,
stringsAsFactors = FALSE
)
fasta_df$protein_id <- sub(" .*", "", fasta_df$protein_id)
protein_key <- merge(protein_key, fasta_df, by = "protein_id", all.x = TRUE)
colnames(readcounts)
## [1] "SRR16148333" "SRR16148332" "SRR16148319" "SRR16148331" "SRR16148318"
## [6] "SRR16148337" "SRR16148341" "SRR16148336" "SRR16148340" "SRR16148335"
## [11] "SRR16148343" "SRR16148334" "SRR16148342" "SRR16148312" "SRR16148338"
## [16] "SRR16148310" "SRR16148339" "SRR16148311" "SRR16148307" "SRR16148306"
## [21] "SRR16148304" "SRR16148303" "SRR16148302" "SRR16148301" "SRR16148329"
## [26] "SRR16148300" "SRR16148328" "SRR16148326" "SRR16148327" "SRR16148324"
## [31] "SRR16148325" "SRR16148322" "SRR16148323" "SRR16148320" "SRR16148308"
## [36] "SRR16148309"
metadata$RunID
## [1] "SRR16148300" "SRR16148301" "SRR16148302" "SRR16148303" "SRR16148304"
## [6] "SRR16148305" "SRR16148306" "SRR16148307" "SRR16148308" "SRR16148309"
## [11] "SRR16148310" "SRR16148311" "SRR16148312" "SRR16148313" "SRR16148314"
## [16] "SRR16148315" "SRR16148316" "SRR16148317" "SRR16148318" "SRR16148319"
## [21] "SRR16148320" "SRR16148321" "SRR16148322" "SRR16148323" "SRR16148324"
## [26] "SRR16148325" "SRR16148326" "SRR16148327" "SRR16148328" "SRR16148329"
## [31] "SRR16148330" "SRR16148331" "SRR16148332" "SRR16148333" "SRR16148334"
## [36] "SRR16148335" "SRR16148336" "SRR16148337" "SRR16148338" "SRR16148339"
## [41] "SRR16148340" "SRR16148341" "SRR16148342" "SRR16148343"
# reorder metadata to match count matrix
metadata <- metadata[match(colnames(readcounts), metadata$RunID), ]
# check
all(colnames(readcounts) == metadata$RunID)
## [1] TRUE
subset_idx <- metadata$Stage %in% c("Cyst", "NoReversion")
counts_subset <- readcounts[, subset_idx]
metadata_subset <- metadata[subset_idx, ]
metadata_subset$Stage <- factor(metadata_subset$Stage)
metadata_subset$Stage <- relevel(metadata_subset$Stage, ref = "NoReversion")
library(DESeq2)
dds <- DESeqDataSetFromMatrix(
countData = counts_subset,
colData = metadata_subset,
design = ~ Stage
)
dds <- dds[rowSums(counts(dds)) > 10, ]
vsd <- vst(dds, blind = TRUE)
plotPCA(vsd, intgroup = "Stage")
## using ntop=500 top features by variance
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DESeq2 package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept" "Stage_Cyst_vs_NoReversion"
res <- results(dds, contrast = c("Stage", "Cyst", "NoReversion"))
head(res)
## log2 fold change (MLE): Stage Cyst vs NoReversion
## Wald test p-value: Stage Cyst vs NoReversion
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 3472.319 -2.5083145
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 322.211 -1.8176323
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 159604.923 0.3271777
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 125.629 -0.2876313
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 310.521 -0.5733679
## augustus_masked-scaffold1-processed-gene-0.9-mRNA-1 336.906 -0.0988602
## lfcSE stat
## <numeric> <numeric>
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 1.282313 -1.956086
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 0.821358 -2.212961
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 0.329998 0.991453
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 0.379722 -0.757478
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 0.387414 -1.479986
## augustus_masked-scaffold1-processed-gene-0.9-mRNA-1 0.318229 -0.310657
## pvalue padj
## <numeric> <numeric>
## augustus_masked-scaffold1-processed-gene-0.13-mRNA-1 0.0504551 0.175170
## augustus_masked-scaffold1-processed-gene-0.14-mRNA-1 0.0269004 0.113770
## augustus_masked-scaffold1-processed-gene-0.19-mRNA-1 0.3214644 0.566511
## augustus_masked-scaffold1-processed-gene-0.34-mRNA-1 0.4487635 0.682256
## augustus_masked-scaffold1-processed-gene-0.49-mRNA-1 0.1388769 0.338906
## augustus_masked-scaffold1-processed-gene-0.9-mRNA-1 0.7560613 0.885973
res <- res[order(res$padj), ]
# remove NA rows first
res_clean <- res[!is.na(res$padj), ]
# then filter significant
sig <- res_clean[res_clean$padj < 0.05, ]
head(sig)
## log2 fold change (MLE): Stage Cyst vs NoReversion
## Wald test p-value: Stage Cyst vs NoReversion
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1 69082.72 8.48994
## maker-scaffold8088-augustus-gene-0.2-mRNA-1 97746.43 9.81823
## maker-scaffold1293-augustus-gene-0.2-mRNA-1 2826.72 4.31533
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1 4450.31 -7.32616
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1 13443.05 -6.03907
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1 2356.72 -5.90921
## lfcSE stat
## <numeric> <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1 0.382587 22.1909
## maker-scaffold8088-augustus-gene-0.2-mRNA-1 0.540952 18.1499
## maker-scaffold1293-augustus-gene-0.2-mRNA-1 0.256636 16.8150
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1 0.489543 -14.9653
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1 0.408097 -14.7981
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1 0.415576 -14.2193
## pvalue
## <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1 4.20722e-109
## maker-scaffold8088-augustus-gene-0.2-mRNA-1 1.28583e-73
## maker-scaffold1293-augustus-gene-0.2-mRNA-1 1.89457e-63
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1 1.23763e-50
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1 1.50619e-49
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1 6.95073e-46
## padj
## <numeric>
## maker-scaffold2631-augustus-gene-0.0-mRNA-1 7.29364e-105
## maker-scaffold8088-augustus-gene-0.2-mRNA-1 1.11456e-69
## maker-scaffold1293-augustus-gene-0.2-mRNA-1 1.09481e-59
## augustus_masked-scaffold1365-processed-gene-0.2-mRNA-1 5.36390e-47
## augustus_masked-scaffold909-processed-gene-0.2-mRNA-1 5.22228e-46
## augustus_masked-scaffold3182-processed-gene-0.3-mRNA-1 2.00830e-42
DE_results <- as.data.frame(res_clean)
library(EnhancedVolcano)
EnhancedVolcano(DE_results,
lab = rownames(DE_results),
x = 'log2FoldChange',
y = 'padj',
title = 'Differential Gene Expression: Cyst vs NoReversion',
subtitle = 'Positive log2FC = upregulated in Cyst; Negative log2FC = upregulated in NoReversion',
xlab = 'Log2 Fold Change (Cyst / NoReversion)',
ylab = '-Log10 Adjusted P-value',
pCutoff = 0.05,
FCcutoff = 1,
pointSize = 2.0,
labSize = 3.0
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the EnhancedVolcano package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the EnhancedVolcano package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
top 10 differentially expressed for Cyst and NoRev stages
library(stringr)
library(dplyr)
res_clean <- as.data.frame(res_clean)
res_clean$gene_id <- sub("-mRNA-.*$", "", rownames(res_clean))
res_clean$gene_id <- sub("\\.[0-9]+$", "", res_clean$gene_id)
res_cyst <- res_clean %>%
filter(log2FoldChange > 0, padj < 0.05) %>%
arrange(desc(log2FoldChange)) %>%
distinct(gene_id, .keep_all = TRUE)
top10_cyst <- head(res_cyst[order(-res_cyst$log2FoldChange), ], 10)
top10_cyst <- as.data.frame(top10_cyst)
res_norev <- res_clean %>%
filter(log2FoldChange < 0, padj < 0.05) %>%
arrange(log2FoldChange) %>% # most negative first
distinct(gene_id, .keep_all = TRUE)
top10_norev <- head(res_norev[order(res_norev$log2FoldChange), ], 10)
top10_norev = as.data.frame(top10_norev)
top_genes <- c(rownames(top10_cyst), rownames(top10_norev))
top10_cyst_df <- data.frame(
protein_id = rownames(top10_cyst),
group = "cyst"
)
top10_norev_df <- data.frame(
protein_id = rownames(top10_norev),
group = "norev"
)
top_genes_df <- rbind(top10_cyst_df, top10_norev_df)
protein_key_top <- protein_key %>%
inner_join(top_genes_df, by = "protein_id")
protein_key_top_unknown <- protein_key_top %>% filter(Protein_Name == "\"Protein_of_unknown_function\"")
also acquire the top 50 for each so we can filter out uncharacterized proteins
top50_cyst <- head(res_cyst[order(-res_cyst$log2FoldChange), ], 50)
top50_cyst <- as.data.frame(top50_cyst)
top50_norev <- head(res_norev[order(res_norev$log2FoldChange), ], 50)
top50_norev <- as.data.frame(top50_norev)
make_top_df <- function(df, group_name) {
data.frame(
protein_id = rownames(df),
group = group_name
)
}
top50_genes_df <- rbind(
make_top_df(top50_cyst, "cyst"),
make_top_df(top50_norev, "norev")
)
protein_key_top50 <- protein_key %>%
inner_join(top50_genes_df, by = "protein_id")
protein_key_top50_unknown <- protein_key_top50 %>%
filter(Protein_Name == "\"Protein_of_unknown_function\"")
length(intersect(rownames(top10_cyst), rownames(top50_cyst))) # should be 10
## [1] 10
length(intersect(rownames(top10_norev), rownames(top50_norev))) # should be 10
## [1] 10
create fasta for blast
clean_ids <- sub(" .*", "", names(seqs))
names(seqs) <- clean_ids
subset_seqs <- seqs[names(seqs) %in% protein_key_top50_unknown$protein_id]
length(subset_seqs)
## [1] 46
setdiff(protein_key_top50_unknown$protein_id, names(seqs))
## character(0)
writeXStringSet(subset_seqs, "unknown_proteins_for_blast.fasta")
HEATMAP heatmap for fun
top_genes <- c(rownames(top10_cyst), rownames(top10_norev))
vsd <- vst(dds, blind = FALSE)
mat <- assay(vsd)
mat_subset <- mat[top_genes, ]
annotation_col <- as.data.frame(colData(dds)[, "Stage", drop = FALSE])
rownames(annotation_col) <- colnames(mat_subset)
library(pheatmap)
pheatmap(mat_subset,
scale = "row",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
annotation_col = annotation_col)
ann_colors <- list(
Stage = c(
Cyst = "red",
NoReversion = "blue"
)
)
pheatmap(mat_subset,
scale = "row",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
annotation_col = annotation_col,
annotation_colors = ann_colors)
sig_genes <- rownames(sig)
mat_sig_subset <- mat[sig_genes, ]
pheatmap(mat_sig_subset,
scale = "row",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation")